## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set(echo=TRUE,eval=FALSE) #setwd("~/Desktop/cornet") #utils::install.packages(pkgs=c("devtools","missRanger","xtable","randomForest","MLmetrics")) #devtools::install_github("rauschenberger/cornet") ## ----process------------------------------------------------------------------ # # features # X <- read.csv("data/PPMI_Baseline_Data_02Jul2018.csv",row.names="PATNO",na.strings=c(".","")) # X <- X[X$APPRDX==1,] # Parkinson's disease # X[c("SITE","APPRDX","EVENT_ID","symptom5_comment")] <- NULL # 100*mean(is.na(X)) # proportion missingness # #x <- mice::complete(data=mice::mice(X,m=10,maxit=5,method="pmm",seed=1),action="all") # low-dimensional # x <- lapply(seq_len(10),function(x) missRanger::missRanger(data=X,pmm.k=3, # num.trees=100,verbose=0,seed=1)) # high-dimensional # x <- lapply(x,function(x) model.matrix(~.-1,data=x)) # # # outcome # Y <- read.csv("data/PPMI_Year_1-3_Data_02Jul2018.csv",na.strings=".") # Y <- Y[Y$APPRDX==1 & Y$EVENT_ID %in% c("V04","V06","V08"),] # Y <- Y[,c("EVENT_ID","PATNO","moca")] # Y <- reshape(Y,idvar="PATNO",timevar="EVENT_ID",direction="wide") # rownames(Y) <- Y$PATNO; Y$PATNO <- NULL # # # overlap # names <- intersect(rownames(X),rownames(Y)) # Y <- Y[names,]; x <- lapply(x,function(x) x[names,]) # save(Y,x,file="data/processed_data.RData") # writeLines(text=capture.output(utils::sessionInfo(),cat("\n"), # sessioninfo::session_info()),con="data/info_data.txt") # # # dictionary # rm(list=ls()) # info <- read.csv("data/PPMI_Data_Dictionary_for_Baseline_Dataset_Jul2018.csv",nrows=238) # info <- info[!info$Variable %in% c("","PATNO","SITE","APPRDX","EVENT_ID","symptom5_comment"),] # info <- paste(paste0("\\item \\textit{",info$Variable,"}: ",info$Description),collapse=" ") # info <- gsub(pattern=" \\(OFF\\)",replacement="",x=info) # info <- gsub(pattern="_",replacement="\\_",x=info,fixed=TRUE) # info <- gsub(pattern="&",replacement="\\\\&",x=info) # cat(info) ## ----analyse------------------------------------------------------------------ # load("data/processed_data.RData",verbose=TRUE) # # colSums(!is.na(Y)) # sample size # round(100*colMeans(Y<25.5,na.rm=TRUE),1) # proportion impairment # # # --- Cross-validating models. --- # names <- paste0(rep(c("lasso","ridge"),each=3),seq_len(3)) # change to 3! # loss <- fit <- p.log <- p.lin <- list() # for(i in seq_along(x)){ # loss[[i]] <- fit[[i]] <- p.log[[i]] <- p.lin[[i]] <- list() # cat("i =",i,"\n") # for(j in seq_along(names)){ # cat("j =",j," ") # alpha <- 1*(substr(names[j],start=1,stop=5)=="lasso") # index <- as.numeric(substr(names[j],start=6,stop=6)) # y <- Y[,index] # cond <- !is.na(y) # set.seed(i) # loss[[i]][[names[j]]] <- cornet::cv.cornet(y=y[cond],cutoff=25.5, # X=x[[i]][cond,],alpha=alpha,rf=(alpha==1),xgboost=(alpha==1)) # set.seed(i) # fit[[i]][[names[j]]] <- cornet::cornet(y=y[cond],cutoff=25.5, # X=x[[i]][cond,],alpha=alpha) # set.seed(i) # temp <- replicate(n=50,expr=cornet:::.test(y=y[cond],cutoff=25.5,X=x[[i]][cond,],alpha=alpha)) # p.log[[i]][[names[j]]] <- median(unlist(temp["log",])) # p.lin[[i]][[names[j]]] <- median(unlist(temp["lin",])) # } # cat("\n") # } # # save(loss,fit,p.log,p.lin,file="results/application.RData") # writeLines(text=capture.output(utils::sessionInfo(),cat("\n"), # sessioninfo::session_info()),con="results/info_app.txt") ## ----table_test--------------------------------------------------------------- # load("results/application.RData",verbose=TRUE) # # k <- "binomial" # compare: k <- "gaussian # # names <- c(paste0("lasso",1:3),paste0("ridge",1:3)) # frame <- data.frame(row.names=names) # for(i in names){ # # # deviance # dev <- as.data.frame(t(sapply(loss,function(x) x[[i]]$deviance))) # dec <- (dev$combined-dev[[k]])/dev[[k]] # # change in percent # frame[i,"dev.min"] <- round(100*min(dec),digits=1) # frame[i,"dev.max"] <- round(100*max(dec),digits=1) # # # proportion with improvement # frame[i,"dev.num"] <- sum(dev$combined < dev[[k]]) # # # significance based on multi-split # if(k=="binomial"){ # pvalue <- sapply(p.log,function(x) x[[i]]) # } # if(k=="gaussian"){ # pvalue <- sapply(p.lin,function(x) x[[i]]) # } # frame[i,"pval.min"] <- round(min(pvalue),digits=3) # frame[i,"pval.max"] <- round(max(pvalue),digits=3) # # # quantiles weight parameter # q <- round(quantile(sapply(fit,function(x) x[[i]]$pi.min),probs=c(0,0.5,1)),digits=2) # frame[i,"pi.min"] <- q[1] # frame[i,"pi.med"] <- q[2] # frame[i,"pi.max"] <- q[3] # # # quantiles scale parameter # q <- round(quantile(sapply(fit,function(x) x[[i]]$sigma.min),probs=c(0,0.5,1)),digits=2) # frame[i,"sd.min"] <- q[1] # frame[i,"sd.med"] <- q[2] # frame[i,"sd.max"] <- q[3] # } # # # presentation # frame <- format(frame) # rownames(frame) <- paste0(substr(x=rownames(frame),start=1,stop=5)," ", # substr(x=rownames(frame),start=6,stop=6)) # colnames(frame) <- gsub(pattern="dev.",replacement="\\\\delta_{\\\\text{",x=colnames(frame)) # colnames(frame) <- gsub(pattern="pval.",replacement="p_{\\\\text{",x=colnames(frame)) # colnames(frame) <- gsub(pattern="pi.",replacement="\\\\pi_{\\\\text{",x=colnames(frame)) # colnames(frame) <- gsub(pattern="sd.",replacement="\\\\sigma_{\\\\text{",x=colnames(frame)) # colnames(frame) <- paste0("$",colnames(frame),"}}$") # # xtable <- xtable::xtable(frame,align="r|rrc|cc|ccc|ccc") # xtable::print.xtable(xtable,include.rownames=TRUE,sanitize.text.function=function(x) x,comment=FALSE) ## ----table_change------------------------------------------------------------- # load("results/application.RData",verbose=TRUE) # # names <- c(paste0("lasso",1:3),paste0("ridge",1:3)) # type <- c("deviance","class","mse","mae","auc","prauc") # # k <- "binomial" # # frame <- matrix(NA,nrow=length(names),ncol=length(type),dimnames=list(names,type)) # # for(i in names){ # for(j in type){ # value <- as.data.frame(t(sapply(loss,function(x) x[[i]][[j]]))) # change <- 100*(value$combined-value[[k]])/value[[k]] # frame[i,j] <- median(change) # } # } # # frame <- format(round(frame,digits=1)) # frame <- gsub(pattern=" ",replacement="+",x=frame) # colnames(frame) <- sapply(colnames(frame),function(x) switch(x,class="\\textsc{mcr}",mse="\\textsc{mse}",mae="\\textsc{mae}",auc="\\textsc{roc-auc}",prauc="\\textsc{pr-auc}",x)) # colnames(frame) <- paste0("$\\Delta$",colnames(frame)) # rownames(frame) <- paste0(substr(x=rownames(frame),start=1,stop=5)," ", # substr(x=rownames(frame),start=6,stop=6)) # xtable <- xtable::xtable(frame,align="r|cccccc") # xtable::print.xtable(xtable,include.rownames=TRUE,sanitize.text.function=function(x) x,comment=FALSE) ## ----table_other-------------------------------------------------------------- # load("results/application.RData",verbose=TRUE) # # table <- numeric() # for(i in 1:3){ # lasso <- apply(sapply(loss,function(x) x[[paste0("lasso",i)]]$deviance),1,median) # names(lasso)[names(lasso)=="binomial"] <- "logistic lasso regression" # names(lasso)[names(lasso)=="combined"] <- "combined lasso regression" # ridge <- apply(sapply(loss,function(x) x[[paste0("ridge",i)]]$deviance),1,median) # names(ridge)[names(ridge)=="binomial"] <- "logistic ridge regression" # names(ridge)[names(ridge)=="combined"] <- "combined ridge regression" # table <- cbind(table,c(lasso[c("logistic lasso regression","combined lasso regression")],ridge[c("logistic ridge regression","combined ridge regression")],lasso["rf"],lasso["xgboost"])) # rownames(table)[rownames(table)=="rf"] <- "\\texttt{randomForest}" # rownames(table)[rownames(table)=="xgboost"] <- "\\texttt{xgboost}" # } # colnames(table) <- paste("year",1:3) # rownames(table) <- paste("\\footnotesize",rownames(table)) # xtable <- xtable::xtable(table) # xtable::print.xtable(xtable,comment=FALSE,hline.after=c(0,2,4),sanitize.text.function=identity) # ## ----figure_MAP--------------------------------------------------------------- # load("results/application.RData",verbose=TRUE) # sum <- fit[[1]]$lasso1 # sum$cvm <- Reduce("+",lapply(fit,function(x) x$lasso1$cvm)) # sum$sigma.min <- sapply(fit,function(x) x$lasso1$sigma.min) # sum$pi.min <- sapply(fit,function(x) x$lasso1$pi.min) # # grDevices::pdf("manuscript/figure_MAP.pdf",width=4,height=3) # # graphics::par(mar=c(4,4,0.5,0.5)) # cornet:::plot.cornet(sum) # # grDevices::dev.off() ## ----figure_TFN--------------------------------------------------------------- # rm(list=ls()) # # sigma <- c(1,2,3); cutoff <- 25.5 # x <- seq(from=20,to=30,length.out=100) # # grDevices::pdf("manuscript/figure_TFN.pdf",width=4,height=3) # # graphics::par(mar=c(4,4,0.5,0.5)) # graphics::plot.new() # graphics::plot.window(xlim=range(x),ylim=c(0,1)) # graphics::box() # graphics::axis(side=1) # graphics::axis(side=2) # graphics::title(xlab=expression(hat(y)),ylab=expression(Phi(hat(y),mu,sigma^2)),line=2.5) # graphics::abline(h=0.5,lty=2,col="grey") # graphics::abline(v=cutoff,lty=2,col="grey") # # lty <- c(2,1,3); lwd <- c(1,1,2) # lty <- c("dashed","solid","dotted") # for(i in seq_along(sigma)){ # p <- stats::pnorm(q=x,mean=cutoff,sd=sigma[i]) # graphics::lines(x=x,y=p,lty=lty[i],lwd=lwd[i]) # } # # graphics::text(x=cutoff,y=0.40,labels=bquote(mu==.(cutoff)),pos=4) # legend <- sapply(sigma,function(x) as.expression(bquote(sigma == .(x)))) # graphics::legend(x="topleft",legend=legend,lty=lty,bty="n",lwd=lwd) # # grDevices::dev.off() ## ----ordinal,eval=FALSE------------------------------------------------------- # cv.cornet <- function (y, cutoff, X, alpha = 1, nfolds.ext = 5, nfolds.int = 10, foldid.ext = NULL, foldid.int = NULL, type.measure = "deviance", ordinal = FALSE,...) { # z <- 1 * (y > cutoff) # if (is.null(foldid.ext)) { # foldid.ext <- palasso:::.folds(y = z, nfolds = nfolds.ext) # } else { # nfolds.ext <- length(unique(foldid.ext)) # } # cols <- c("intercept", "binomial", "combined") # ### trial start ### # if(ordinal){cols <- c(cols,"ordinal")} # ### trial end ### # pred <- matrix(data = NA, nrow = length(y), ncol = length(cols), # dimnames = list(NULL, cols)) # for (i in seq_len(nfolds.ext)) { # y0 <- y[foldid.ext != i] # z0 <- z[foldid.ext != i] # X0 <- X[foldid.ext != i, ] # X1 <- X[foldid.ext == i, ] # if (is.null(foldid.int)) { # foldid <- palasso:::.folds(y = z0, nfolds = nfolds.int) # } else { # foldid <- foldid.int[foldid.ext != i] # } # fit <- cornet::cornet(y = y0, cutoff = cutoff, X = X0, # alpha = alpha, type.measure = type.measure, foldid = foldid, # ...) # tryCatch(expr = plot.cornet(fit), error = function(x) NULL) # temp <- predict.cornet(fit, newx = X1) # if (any(temp < 0 | temp > 1)) { # stop("Outside unit interval.", call. = FALSE) # } # model <- colnames(temp) # for (j in seq_along(model)) { # pred[foldid.ext == i, model[j]] <- temp[[model[j]]] # } # if(ordinal){ # ### trial start ### # #browser() # y0_ord <- as.factor(y0) # fit <- ordinalNet::ordinalNet(x=X0,y=y0_ord,alpha=alpha) # pred_ord <- predict(fit,newx=X1) # above <- as.numeric(levels(y0_ord))>cutoff # pred[foldid.ext ==i,"ordinal"] <- rowSums(pred_ord[,above]) # } ### trial end ### # } # type <- c("deviance", "class", "mse", "mae", "auc") # loss <- lapply(X = type, FUN = function(x) palasso:::.loss(y = z, # fit = pred, family = "binomial", type.measure = x, # foldid = foldid.ext)[[1]]) # names(loss) <- type # loss <- lapply(loss, function(x) signif(x, digits = 6)) # return(loss) # } # predict.cornet <- cornet:::predict.cornet # # loss <- list() # for(i in seq_along(Y)){ # load("data/processed_data.RData",verbose=TRUE) # cond <- !is.na(Y[[i]]) # loss[[i]] <- cv.cornet(y=Y[[i]][cond],X=x[[1]][cond,],cutoff=25.5,ordinal=TRUE) # } # sapply(loss,function(x) x$deviance) # # # ### Ordinal regression with two different packages. # # # # load("data/processed_data.RData",verbose=TRUE) # # cond <- !is.na(Y$moca.V04) # # y <- as.factor(x=Y$moca.V04[cond]) # # x <- x[[1]][cond,] # # # # # ordinalNet # # model <- ordinalNet::ordinalNet(x=x,y=y) # # y_hat <- predict(model) # # below <- as.numeric(levels(y))<=25.5 # # above <- as.numeric(levels(y))>25.5 # # b1 <- rowSums(y_hat[,below]) # # a1 <- rowSums(y_hat[,above]) # # # # # glmnetcr # # model <- glmnetcr::glmnetcr(x=x,y=y) # # s <- glmnetcr::select.glmnetcr(model) # # y_hat <- predict(model)$probs[,,s] # # below <- as.numeric(colnames(y_hat))<=25.5 # # above <- as.numeric(colnames(y_hat))>25.5 # # b2 <- rowSums(y_hat[,below]) # # a2 <- rowSums(y_hat[,above]) # # # # cor(a1,a2) # # cor(b1,b2)