## ----eval,eval=TRUE-----------------------------------------------------------
knitr::opts_chunk$set(echo=TRUE,eval=FALSE)
sim.app <- FALSE # reproduce simulation and application?
fig.tab <- FALSE # reproduce figures and tables?

## ----setup,eval=sim.app|fig.tab-----------------------------------------------
# path <- "C:/Users/arauschenberger/Desktop/sparselink" # LIH (Windows)
# #path <- "/Users/armin.rauschenberger/Desktop/LIH/sparselink" # LCSB (Mac)
# 
# dir <- c("results","manuscript","package/R/functions.R")
# for(i in seq_along(dir)){
#   if(!dir.exists(file.path(path,dir[i]))&!file.exists(file.path(path,dir[i]))){
#     stop(paste0("Require folder/file'",dir[i],"'."))
#   }
# }
# source(file.path(path,"package/R/functions.R")) # Or load 'sparselink' package.
# 
# inst <- rownames(utils::installed.packages())
# pkgs <- c("knitr","rmarkdown","glmnet","BiocManager","mvtnorm","glmtrans","spls","xrnet")
# for(i in seq_along(pkgs)){
#   if(!pkgs[i]%in%inst){
#     utils::install.packages(pkgs[i])
#   }
# }
# pkgs <- c("recount3","edgeR")
# for(i in seq_along(pkgs)){
#   if(!pkgs[i]%in%inst){
#     BiocManager::install(pkgs[i])
#   }
# }
# 
# blue <- "blue"; red <- "red"
# 
# if(exists("sim.app")&exists("fig.tab")){
#   if(!sim.app&fig.tab){
#     files <- c("simulation_multiple.RData","simulation_transfer.RData","recount3_data.RData","explore_data.RData","application.RData")
#     for(i in seq_along(files)){
#       if(!file.exists(file.path(path,"results",files[i]))){
#         stop("File",files[i],"is missing.")
#       }
#     }
#   }
# }

## ----method_figure,eval=fig.tab-----------------------------------------------
# #<<setup>>
# 
# grDevices::postscript(file=file.path(path,"manuscript","fig_flow.eps"),width=6,height=2.5,horizontal=FALSE,onefile=FALSE,paper="special")
# graphics::par(mfrow=c(1,1),mar=c(0,0,0,0))
# graphics::plot.new()
# graphics::plot.window(xlim=c(-0.2,1.0),ylim=c(0.0,1.0))
# cex <- 0.8
# 
# pos <- data.frame(left=0.2,right=0.8,top=0.8,centre=0.45,bottom=0.1)
# mar <- data.frame(vertical=0.08,horizontal=0.08,dist=0.04)
# 
# graphics::text(labels=paste("problem",1:2),x=c(pos$left,pos$right),y=pos$top+2*mar$vertical,font=2,col=c(blue,red),cex=cex)
# graphics::text(labels=expression(hat(beta)["j,1"]^{init}),x=pos$left,y=pos$top,col=blue)
# graphics::text(labels=expression(hat(beta)["j,2"]^{init}),x=pos$right,y=pos$top,col=red)
# 
# graphics::arrows(x0=rep(c(pos$left,pos$right),each=2),x1=rep(c(pos$left,pos$right),times=2)+c(-mar$horizontal,-mar$horizontal,mar$horizontal,mar$horizontal),y0=pos$top-mar$vertical,y1=pos$centre+mar$vertical,length=0.1,col=rep(c(blue,red),each=2),lwd=2)
# 
# graphics::text(labels=expression(w["j,1"]^{int}),x=pos$left-mar$horizontal-mar$dist,y=pos$centre,col=blue)
# graphics::text(labels=expression(w["p+j,1"]^{int}),x=pos$left-mar$horizontal+mar$dist,y=pos$centre,col=blue)
# graphics::text(labels=expression(w["j,1"]^{ext}),x=pos$left+mar$horizontal-mar$dist,y=pos$centre,col=red)
# graphics::text(labels=expression(w["p+j,1"]^{ext}),x=pos$left+mar$horizontal+mar$dist,y=pos$centre,col=red)
# 
# graphics::text(labels=expression(w["j,2"]^{ext}),x=pos$right-mar$horizontal-mar$dist,y=pos$centre,col=blue)
# graphics::text(labels=expression(w["p+j,2"]^{ext}),x=pos$right-mar$horizontal+mar$dist,y=pos$centre,col=blue)
# graphics::text(labels=expression(w["j,2"]^{int}),x=pos$right+mar$horizontal-mar$dist,y=pos$centre,col=red)
# graphics::text(labels=expression(w["p+j,2"]^{int}),x=pos$right+mar$horizontal+mar$dist,y=pos$centre,col=red)
# 
# graphics::arrows(x0=c(pos$left,pos$right),y0=pos$centre-mar$vertical,y1=pos$bottom+mar$vertical,col=c(blue,red),length=0.1,lwd=2)
# graphics::text(labels=expression(hat(beta)["j,1"]^{final}==hat(gamma)["j,1"]-hat(gamma)["p+j,1"]),x=pos$left,y=pos$bottom,col=blue)
# graphics::text(labels=expression(hat(beta)["j,2"]^{final}==hat(gamma)["j,2"]-hat(gamma)["p+j,2"]),x=pos$right,y=pos$bottom,col=red)
# 
# graphics::text(x=-0.1,y=c(pos$top,pos$bottom),labels=paste("stage",1:2),font=2,cex=cex)
# grDevices::dev.off()

## ----simulation,eval=sim.app--------------------------------------------------
# #<<setup>>
# 
# repetitions <- 10
# 
# for(mode in c("transfer","multiple")){
# 
#   grid <- expand.grid(prob.separate=c(0.0,0.025,0.05),prob.common=c(0.0,0.025,0.05),family="gaussian")
#   grid <- grid[rep(seq_len(nrow(grid)),each=repetitions),] #
#   grid$seed <- seq_len(nrow(grid))
#   grid$family <- as.character(grid$family)
#   deviance <- auc <- time <- mse.coef <- mse.zero <- mse.nzero <- sel.num <- sel.coef <- sel.count <- hyperpar <- list()
#   for(i in seq(from=1,to=nrow(grid))){
#     set.seed(seed=grid$seed[i])
#     cat("i=",i,"\n")
#     if(mode=="transfer"){
#       data <- sim_data_trans(prob.common=grid$prob.common[i],prob.separate=grid$prob.separate[i],family=grid$family[i])
#       method <- c("wrap_separate","wrap_glmtrans","sparselink","wrap_xrnet")
#     } else if(mode=="multiple"){
#       #--- multi-task learning ---
#       data <- sim_data_multi(prob.common=grid$prob.common[i],prob.separate=grid$prob.separate[i],family=grid$family[i])
#       method <- c("wrap_separate","wrap_mgaussian","sparselink","wrap_spls")
#     }
# 
#     result <- traintest(y_train=data$y_train,X_train=data$X_train,y_test=data$y_test,X_test=data$X_test,family=grid$family[i],method=method)
#     hyperpar[[i]] <- result$hyperpar
#     time[[i]] <- result$time
#     auc[[i]] <- result$auc
#     deviance[[i]] <- result$deviance
#     sel.num[[i]] <- t(sapply(result$coef,function(x) colSums(x!=0)))
#     sel.count[[i]] <- t(sapply(result$coef,function(x) rowMeans(count_matrix(truth=sign(data$beta),estim=sign(x))))) # Add na.rm=TRUE?
# 
#     sel.coef[[i]] <- t(sapply(result$coef,function(x) colMeans(sign(x)!=sign(data$beta))))
#     # CONTINUE HERE: consider sparsity, true positives, false negatives, signs
# 
#     mse.coef[[i]] <- t(sapply(result$coef,function(x) colMeans((data$beta-x)^2)))
#     mse.zero[[i]] <- t(sapply(result$coef,function(x) colMeans(((data$beta==0)*(data$beta-x))^2)))
#     mse.nzero[[i]] <- t(sapply(result$coef,function(x) colMeans(((data$beta!=0)*(data$beta-x))^2)))
#   }
#   save(grid,deviance,auc,sel.num,sel.count,sel.coef,mse.coef,mse.zero,mse.nzero,time,file=file.path(path,"results",paste0("simulation_",mode,".RData")))
# }
# 
# writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),      sessioninfo::session_info()),con=paste0(path,"/results/info_sim.txt"))

## ----simulation_figure,eval=fig.tab-------------------------------------------
# #<<setup>>
# 
# caption <- paste(c("\\textbf{Multi-task learning.}","\\textbf{Transfer learning.}"),"Comparison of different measures (rows) between an available method (red) and the proposed method (blue) in different simulation settings (columns), based on the average of three problems",c("(tasks)","(datasets)"),"for each repetition out of ten. Measures: performance metric (mean squared error on hold-out data, as a fraction of the one from standard lasso regression; a point below the dashed line means that",c("multi-task","transfer"),"learning improves predictions), sparsity (number of non-zero coefficients), precision (number of coefficients with correct signs divided by number of non-zero coefficients). The arrows point in the direction of improvement. Settings: percentage of features with a common effect for all problems ($\\pi_\\theta$), percentage of features with a specific effect for each problem ($\\pi_\\delta$).",c("\\label{fig_sim_multiple}","\\label{fig_sim_transfer}"))
# 
# figure_change <- function(model0,model1="sparselink",model2){
# 
#   mode <- paste0(100*grid$prob.common,"%\n",100*grid$prob.separate,"%")
# 
#   graphics::par(mfrow=c(3,1),mar=c(3,3,1,1))
# 
#   label <- function(){
#     cex <- 0.5
#     at <- 0.3
#     graphics::mtext(text=expression(pi[theta]==phantom(.)),side=1,line=0.2,at=at,cex=cex)
#     graphics::mtext(text=expression(pi[delta]==phantom(.)),side=1,line=1.2,at=at,cex=cex)
#   }
# 
#   #--- predictive performance ---
#   means <- t(sapply(X=deviance,FUN=rowMeans))
#   means <- means/means[,"wrap_separate"]
#   plot_change(x=mode,y0=means[,model0],y1=means[,model1],y2=means[,model2],main="metric",increase=FALSE)
#   graphics::abline(h=1,lty=2,col="grey")
#   label()
# 
#   #--- sparsity ---
#   nzero <- sapply(X=sel.num,FUN=rowMeans)
#   plot_change(x=mode,y0=nzero[model0,],y1=nzero[model1,],y2=nzero[model2,],main="sparsity",increase=FALSE)
# 
#   graphics::abline(h=0,lty=2,col="grey")
#   label()
# 
#   #--- precision ---
#   precision <- sapply(X=sel.count,FUN=function(x) x[,"precision"])
#   precision[is.na(precision)] <- 0
#   plot_change(x=mode,y0=precision[model0,],y1=precision[model1,],y2=precision[model2,],main="precision",increase=TRUE)
# 
#   graphics::abline(h=0,lty=2,col="grey")
#   label()
# 
# }
# 
# grDevices::postscript(file=file.path(path,"manuscript","fig_sim_multiple.eps"),width=6.5,height=6,horizontal=FALSE,onefile=FALSE,paper="special")
# load(file.path(path,paste0("results/simulation_multiple.RData")),verbose=TRUE)
# #model.ref <- "wrap_mgaussian"
# #model.own <- "sparselink"
# figure_change(model0="wrap_mgaussian",model1="sparselink",model2="wrap_spls")
# rowMeans(sapply(deviance,function(x) rank(rowMeans(x))))
# rowMeans(sapply(deviance,function(x) colMeans(t(x)/x["wrap_separate",])))
# runtime <- rowSums(sapply(time,function(x) x))
# round(runtime/runtime["wrap_separate"],digits=2)
# grDevices::dev.off()
# 
# grDevices::postscript(file=file.path(path,"manuscript","fig_sim_transfer.eps"),width=6.5,height=6,horizontal=FALSE,onefile=FALSE,paper="special")
# load(file.path(path,paste0("results/simulation_transfer.RData")))
# #model.ref <- "wrap_glmtrans"
# #model.own <- "sparselink"
# figure_change(model0="wrap_glmtrans",model1="sparselink",model2="wrap_xrnet")
# rowMeans(sapply(deviance,function(x) rank(rowMeans(x))))
# rowMeans(sapply(deviance,function(x) colMeans(t(x)/x["wrap_separate",])))
# runtime <- rowSums(sapply(time,function(x) x))
# round(runtime/runtime["wrap_separate"],digits=2)
# grDevices::dev.off()

## ----sim_extra,eval=sim.app---------------------------------------------------
# # Effect of sample size in source or target dataset (TL), effect of sample size (MTL).
# #<<setup>>
# 
# repetitions <- 50
# grid <- metric <- list()
# 
# for(mode in c("MTL-size","TL-source","TL-target")){ #,"TL-prop","MTL-prop"
#   metric[[mode]] <- list()
#   cand <- c(20,40,60,80,100)
#   if(mode=="MTL-size"){
#     grid[[mode]] <- expand.grid(prob.common=0.05,prob.separate=0.025,family="gaussian",n0=cand)
#   } else if(mode=="TL-source"){
#     grid[[mode]] <- expand.grid(prob.common=0.05,prob.separate=0.025,family="gaussian",n_source=cand,n_target=50)
#   } else if(mode=="TL-target"){
#     grid[[mode]] <- expand.grid(prob.common=0.05,prob.separate=0.025,family="gaussian",n_source=50,n_target=cand)
#   } else if(mode %in% c("TL-prop","MTL-prop")){
#     cand <- c(0.025,0.05,0.10,0.15,0.20)
#     grid[[mode]] <- expand.grid(prob.common=cand,prob.separate=NA,family="gaussian",n_source=50,n_target=50,n0=50)
#   } else {
#     stop("Wrong mode.")
#   }
#   grid[[mode]] <- grid[[mode]][rep(seq_len(nrow(grid[[mode]])),each=repetitions),]
#   #grid[[mode]]$seed <- seq_len(nrow(grid[[mode]]))
#   grid[[mode]]$seed <- rep(x=seq_len(repetitions),times=length(cand))
#   grid[[mode]]$family <- as.character(grid[[mode]]$family)
#   cond <- is.na(grid[[mode]]$prob.separate)
#   grid[[mode]]$prob.separate[cond] <- 0.5*grid[[mode]]$prob.common[cond]
# 
#   for(i in seq(from=1,to=nrow(grid[[mode]]))){
#       set.seed(seed=grid$seed[i])
#       cat("i=",i,"\n")
#       if(mode %in% c("TL-source","TL-target","TL-prop")){
#         n0 <- rep(c(grid[[mode]]$n_source[i],grid[[mode]]$n_target[i]),times=c(2,1))
#         data <- sim_data_trans(prob.common=grid[[mode]]$prob.common[i],prob.separate=grid[[mode]]$prob.separate[i],family=grid[[mode]]$family[i],n0=n0)
#         method <- c("wrap_separate","wrap_glmtrans","sparselink","wrap_xrnet")
#       } else if(mode %in% c("MTL-size","MTL-prop")){
#         data <- sim_data_multi(prob.common=grid[[mode]]$prob.common[i],prob.separate=grid[[mode]]$prob.separate[i],family=grid[[mode]]$family[i],n0=grid[[mode]]$n0[i])
#         method <- c("wrap_separate","wrap_mgaussian","sparselink","wrap_spls")
#       } else {
#         stop("Wrong mode.")
#       }
#       result <- traintest(y_train=data$y_train,X_train=data$X_train,y_test=data$y_test,X_test=data$X_test,family=grid[[mode]]$family[i],method=method)
#       metric[[mode]][[i]] <- result$deviance
#     }
# }
# 
# save(grid,metric,file=file.path(path,"results","simulation_devel.RData"))
# 
# writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),      sessioninfo::session_info()),con=paste0(path,"/results/info_sim_extra.txt"))

## ----sim_extra_figure,eval=fig.tab--------------------------------------------
# #<<setup>>
# load(file.path(path,"results","simulation_devel.RData"))
# 
# grDevices::postscript(file=file.path(path,"manuscript","fig_sim_extra.eps"),width=6.5,height=3,horizontal=FALSE,onefile=FALSE,paper="special")
# cex <- 0.8
# graphics::par(mfrow=c(1,3),mar=c(4.5,4.5,1.5,1),oma=c(0,0,0,0))
# #graphics::layout(mat=matrix(data=c(1,1,2,2,3,3,0,4,4,0,5,5),ncol=2))
# for(mode in c("MTL-size","TL-source","TL-target")){
# #for(mode in c("MTL-prop","TL-prop")){
#   if(mode %in% c("MTL-size","MTL-prop","TL-prop")){
#     mse <- sapply(metric[[mode]],function(x) rowMeans(x))
#   } else if(mode %in% c("TL-source","TL-target")){
#     mse <- sapply(metric[[mode]],function(x) x[,3])
#   }
#   if(mode %in% c("TL-source","TL-target","TL-prop")){
#     col <- c("wrap_separate"="black","wrap_glmtrans"="red","wrap_xrnet"="orange","sparselink"="blue")
#     lty <- c("wrap_separate"=3,"wrap_glmtrans"=2,"wrap_xrnet"=2,"sparselink"=1)
#   } else if(mode %in% c("MTL-size","MTL-prop")) {
#     col <- c("wrap_separate"="black","wrap_mgaussian"="red","wrap_spls"="orange","sparselink"="blue")
#     lty <- c("wrap_separate"=3,"wrap_mgaussian"=2,"wrap_spls"=2,"sparselink"=1)
#   }
#   if(mode=="TL-source"){
#     params <- grid[[mode]]$n_source
#   } else if(mode=="TL-target"){
#     params <- grid[[mode]]$n_target
#   } else if(mode=="MTL-size"){
#     params <- grid[[mode]]$n0
#   } else if(mode %in% c("TL-prop","MTL-prop")){
#     params <- grid[[mode]]$prob.common
#   }
#   unique <- unique(params)
#   graphics::plot.new()
#   graphics::plot.window(xlim=range(params),ylim=range(log(mse)))
#   graphics::box()
#   if(mode=="MTL-size"){
#     main <- "MTL - varying sample size"
#     xlab <- bquote("sample size ("~n[1]~"="~n[2]~"="~n[3]~")")
#     legend <- ""
#   } else if(mode=="TL-source"){
#     main <- "TL - varying source sample size"
#     xlab <- bquote("source sample size ("~n[1]~"="~n[2]~")")
#     legend <- bquote("target sample size:"~n[3]==.(unique(grid[[mode]]$n_target)))
#   } else if(mode=="TL-target"){
#     main <- "TL - varying target sample size"
#     xlab <- bquote("target sample size ("~n[3]~")")
#     legend <- bquote("source sample size:"~n[1]~"="~n[2]==.(unique(grid[[mode]]$n_source)))
#   } else if(mode=="MTL-prop"){
#     xlab <- "blabla"
#     main <- "MTL - effect proportion"
#     legend <- ""
#   } else if(mode=="TL-prop"){
#     xlab <- "blabla"
#     main <- "TL - effect proportion"
#     legend <- ""
#   }
#   graphics::title(main=main,cex.main=cex)
#   graphics::title(ylab="log MSE",line=2.5,xlab=xlab,cex.lab=cex)
#   graphics::legend(x="topleft",legend=legend,bty="n",cex=cex)
#   if(mode %in% c("TL-prop","MTL-prop")){
#     graphics::axis(side=1,at=unique,labels=paste0(100*unique,"%"),cex.axis=cex)
#   } else {
#     graphics::axis(side=1,at=unique,cex.axis=cex)
#   }
#   graphics::axis(side=2,cex.axis=cex)
# 
#   for(i in names(col)){
#     val <- tapply(X=mse[i,],INDEX=params,FUN=function(x) mean(x))
#     graphics::lines(x=unique,y=log(val),col=col[i],type="o",pch=16,lty=lty[i])
#   }
# }
# grDevices::dev.off()

## ----define_projects,eval=fig.tab---------------------------------------------
# project <- list()
# project$IBD <- c("Tew (2016)"="SRP063496",
#                  "Haberman (2019)"="SRP129004",
#                  "Verstockt (2019)"="ERP113396",
#                  "Verstockt (2020)"="ERP114636",
#                  "Boyd (2018)"="SRP100787")
# project$RA <- c("Baker (2019)"="SRP169062",
#                 "Moncrieffe (2017)"="SRP074736",
#                 "Goldberg (2018)"="SRP155483")
# extra <- c("Lewis (2019)"="ERP104864") # https://doi.org/10.1016/j.celrep.2019.07.091

## ----download_data,eval=sim.app-----------------------------------------------
# #<<setup>>
# #<<define_projects>>
# 
# data <- list()
# for(i in c(unlist(project),extra)){
#   data[[i]] <- recount3::create_rse_manual(
#     project=i,
#     project_home="data_sources/sra",
#     organism="human",
#     annotation = "gencode_v26",
#     type="gene")
# }
# save(data,file=file.path(path,"results/recount3_data.RData"))
# 
# writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),
#       sessioninfo::session_info()),con=paste0(path,"/results/info_data.txt"))

## ----preprocess_data,eval=sim.app---------------------------------------------
# #<<setup>>
# #<<define_projects>>
# 
# load(file.path(path,"results/recount3_data.RData"))
# 
# #- - - - - - - - - - - - - - -
# #- - - extract features  - - -
# #- - - - - - - - - - - - - - -
# 
# # extract features
# x <- list()
# for(i in c(unlist(project),extra)){
#   counts <- t(SummarizedExperiment::assays(data[[i]])$raw_counts)
#   colnames(counts) <- SummarizedExperiment::rowRanges(data[[i]])$gene_name
#   x[[i]] <- counts
# }
# 
# # select most expressed protein-coding genes (for all TL projects together)
# select <- list()
# total <- numeric()
# for(i in unlist(project)){
#   #total <- rbind(total,Matrix::colSums(x[[i]])) # original: mean filtering
#   total <- rbind(total,apply(X=x[[i]],MARGIN=2,FUN=stats::var)) # trial: variance filtering
# }
# type <- SummarizedExperiment::rowData(data[[i]])$gene_type
# cond <- type=="protein_coding"
# total[,!cond] <- 0
# rank <- apply(X=total,MARGIN=1,FUN=rank)
# mean_rank <- rowMeans(rank)
# #temp <- cond & apply(total,2,function(x) all(x>0)) & (mean_rank >= sort(mean_rank[cond],decreasing=TRUE)[2000]) # original: top 2000
# temp <- cond & mean_rank >= sort(mean_rank[cond],decreasing=TRUE)[5000] # trial: top 5000
# 
# for(i in unlist(project)){
#   select[[i]] <- temp
# }
# 
# # select most expressed protein-coding genes (for MTL project)
# #mean <- apply(X=x[[extra]],MARGIN=2,FUN=mean) # original
# var <- apply(X=x[[extra]],MARGIN=2,FUN=var) # trial
# #warning("change number in next line")
# #temp <- cond & mean >= sort(mean[cond],decreasing=TRUE)[5000] # trial: top 5000
# temp <- cond & var >= sort(var[cond],decreasing=TRUE)[5000] # trial: top 5000
# select[[extra]] <- temp
# 
# # pre-processing
# for(i in c(unlist(project),extra)){
#   lib.size <- Matrix::rowSums(x[[i]])
#   x[[i]] <- x[[i]][,select[[i]],drop=FALSE]
#   norm.factors <- edgeR::calcNormFactors(object=t(x[[i]]),lib.size=lib.size)
#   gamma <- norm.factors*lib.size/mean(lib.size)
#   gamma <- matrix(data=gamma,nrow=nrow(x[[i]]),ncol=ncol(x[[i]]))
#   x[[i]] <- x[[i]]/gamma
#   x[[i]] <- 2*sqrt(x[[i]] + 3/8) # Anscombe transform
#   x[[i]] <- scale(x[[i]]) # scale because of different datasets!?
# }
# 
# #- - - - - - - - - - - - - -
# #- - - extract targets - - -
# #- - - - - - - - - - - - - -
# 
# # extract information on samples
# frame <- list()
# for(i in c(unlist(project),extra)){
#   list <- strsplit(data[[i]]$sra.sample_attributes,split="\\|")
#   data[[i]]$sra.experiment_attributes
#   # What about sra.experiment_attributes?
#   n <- length(list)
#   cols <- unique(sapply(strsplit(unlist(list),split=";;"),function(x) x[1]))
#   ncol <- length(cols)
#   frame[[i]] <- matrix(data=NA,nrow=n,ncol=ncol,dimnames=list(rownames(x[[i]]),cols))
#   for(j in seq_len(n)){
#     for(k in seq_len(ncol)){
#       vector <- list[[j]]
#       which <- which(substring(text=vector,first=1,last=nchar(cols[k]))==cols[k])
#       string <- vector[which]
#       if(length(string)==0){next}
#       frame[[i]][j,k] <- strsplit(string,split=";;")[[1]][2]
#     }
#   }
#   frame[[i]] <- as.data.frame(frame[[i]])
# }
# 
# # extract binary outcome
# y <- z <- list()
# for(i in unlist(project)){
#   # CONTINUE HERE!!!
#   if(i=="ERP113396"){
#     y[[i]] <- sapply(X=frame[[i]]$`clinical history`,FUN=function(x) switch(EXPR=x,"responder"=1,"non-responder"=0,stop("invalid")))
#   } else if(i=="ERP114636"){
#     y[[i]] <- sapply(X=frame[[i]]$`clinical information`,FUN=function(x) switch(EXPR=x,"response to vedolizumab therapy"=1-1,"no response to vedolizumab therapy"=0+1,stop("invalid")))
#     warning("Inverting response and non-response!")
#   } else if(i=="SRP100787"){
#     y[[i]] <- sapply(X=frame[[i]]$condition,FUN=function(x) switch(EXPR=x,"CD inactive"=1,"UC inactive"=1,"CD active"=0,"UC active"=0,control=NA,"NA"=NA,stop("invalid")))
#   } else if(i=="SRP129004"){
#     y[[i]] <- sapply(X=frame[[i]]$`week 4 remission`,FUN=function(x) switch(EXP=x,"Yes"=1,"No"=0,"NA"=NA,stop("invalid")))
#     suppressWarnings(z[[i]] <- data.frame(pucai=as.numeric(frame[[i]]$pucai),mayo=as.numeric(frame[[i]]$`total mayo score`),histology=as.numeric(frame[[i]]$`histology severity score`)))
#   } else if(i=="SRP063496"){
#     y[[i]] <- sapply(X=frame[[i]]$`remission at week 10`,FUN=function(x) switch(x, "Remitter"=1,"Non-remitter"=0,"N/A"=NA,stop("invalid")))
#   } else if(i=="SRP169062"){
#     y[[i]] <- sapply(X=frame[[i]]$`flare event`,FUN=function(x) switch(x,"no flare"=1,"flare"=0,stop("invalid")))
#   } else if(i=="SRP155483"){
#     y[[i]] <- sapply(X=frame[[i]]$`disease activity`,FUN=function(x) switch(x,"remission"=1,"Low"=0,"Moderate"=0,"High"=0,"--"=NA,stop("invalid")))
#     z[[i]] <- sapply(X=frame[[i]]$`disease activity`,FUN=function(x) switch(x,"remission"=0,"Low"=1,"Moderate"=2,"High"=3,"--"=NA,stop("invalid")))
#   } else if(i=="SRP074736"){
#     y[[i]] <- sapply(X=frame[[i]]$`mtx response status`,FUN=function(x) switch(x,"responder"=1,"non-responder"=0,"control"=NA,stop("invalid")))
#   }
# }
# 
# # overlap
# for(j in unlist(project)){
#   is.na <- is.na(y[[j]])
#   if(length(is.na)!=nrow(x[[j]])){stop()}
#   y[[j]] <- y[[j]][!is.na]
#   if(!is.null(z[[j]])){
#     if(is.vector(z[[j]])){
#       z[[j]] <- z[[j]][!is.na]
#     } else {
#       z[[j]] <- z[[j]][!is.na,]
#     }
#   }
#   x[[j]] <- x[[j]][!is.na,]
# }

## ----explore_apply,eval=sim.app-----------------------------------------------
# #<<setup>>
# #<<define_projects>>
# #<<preprocess_data>>
# 
# set.seed(1)
# alpha.holdout <- 0
# alpha.crossval <- 1
# family <- "binomial"
# nfolds <- 10
# codes <- unlist(project)
# coef <- matrix(data=NA,nrow=ncol(x[[1]]),ncol=length(codes),dimnames=list(NULL,codes))
# auc <- auc.pvalue <- matrix(data=NA,nrow=length(codes),ncol=length(codes),dimnames=list(codes,codes))
# foldid <- make_folds_trans(y=y,family="binomial",nfolds=nfolds)
# 
# ridge <- lasso <- list()
# for(i in seq_along(codes)){
#   ridge[[i]] <- glmnet::cv.glmnet(x=x[[codes[i]]],y=y[[codes[i]]],family=family,alpha=alpha.holdout,foldid=foldid[[i]])
#   coef[,i] <- stats::coef(ridge[[i]],s="lambda.min")[-1]
#   for(j in seq_along(codes)){
#     if(i==j){
#       y_hat <- rep(x=NA,times=length(y[[i]]))
#       for(k in seq_len(nfolds)){
#         holdout <- foldid[[i]]==k
#         temp <- glmnet::cv.glmnet(x=x[[codes[i]]][!holdout,],y=y[[codes[i]]][!holdout],family=family,alpha=alpha.crossval)
#         y_hat[holdout] <- predict(object=temp,newx=x[[codes[i]]][holdout,],s="lambda.min",type="response")
#       }
#     } else {
#       y_hat <- as.numeric(predict(object=ridge[[i]],newx=x[[j]],s="lambda.min",type="response"))
#     }
#     auc[i,j] <- pROC::auc(response=y[[codes[j]]],predictor=y_hat,direction="<",levels=c(0,1))
#     auc.pvalue[i,j] <- stats::wilcox.test(rank(y_hat)~y[[codes[[j]]]],alternative="less",exact=FALSE)$p.value
#   }
# }
# 
# save(coef,auc,auc.pvalue,codes,file=file.path(path,"results","explore_data.RData"))
# 
# writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),
#       sessioninfo::session_info()),con=paste0(path,"/results/info_explore.txt"))

## ----explore_tables,eval=fig.tab,results="asis"-------------------------------
# #<<setup>>
# #if(any(unlist(project)!=names(refs))){stop("not compatible")}
# 
# load(file.path(path,"results/explore_data.RData"))
# names <- gsub(pattern="IBD.|RA.",replacement="",x=names(unlist(project)))
# codes <- colnames(coef)
# cor.pvalue <- matrix(data=NA,nrow=length(codes),ncol=length(codes),dimnames=list(codes,codes))
# for(i in seq_along(codes)){
#   for(j in seq_along(codes)){
#     cor.pvalue[i,j] <- stats::cor.test(x=coef[,i],y=coef[,j],method="spearman",exact=FALSE)$p.value
#   }
# }
# diag(cor.pvalue) <- NA
# 
# insert.space <- function(table,cut){
#   index.left <- index.top <- seq_len(cut)
#   index.right <- index.bottom <- seq(from=cut+1,to=ncol(table))
#   top <- cbind(table[index.top,index.left],"",table[index.top,index.right])
#   bottom <- cbind(table[index.bottom,index.left],"",table[index.bottom,index.right])
#   out <- rbind(top,"",bottom)
#   colnames(out)[colnames(out)==""] <- " "
#   return(out)
# }
# 
# table <- stats::cor(coef,method="spearman")
# rownames(table) <- colnames(table) <- names
# black <- (!is.na(cor.pvalue)) & (cor.pvalue<=0.05)
# star <- (!is.na(cor.pvalue)) & (cor.pvalue<=0.05/choose(n=length(codes),k=2))
# nonnegative <- table>=0
# table <- format(round(table,digits=2),digits=2,trim=TRUE)
# table[nonnegative] <- paste0("\\phantom{-}",table[nonnegative])
# table[!black] <- paste0("\\textcolor{gray}{",table[!black],"}")
# table[star] <- paste0(table[star],"$^\\star$")
# table[!star] <- paste0(table[!star],"\\phantom{$^\\star$}")
# #table[nonnegative] <- paste0("-",table[nonnegative])
# diag(table) <- "-"
# table <- insert.space(table=table,cut=5)
# xtable <- xtable::xtable(x=table,align="rccccccccc",caption="Spearman correlation coefficients between the ridge regression coefficients from different datasets. Pairwise combinations of datasets with significantly correlated regression coefficients are highlighted, with black colour for nominal significance ($p$-value $\\leq 0.05$) and stars for adjusted significance ($p$-value $\\leq 0.05/28$). We expect a correlation coefficient close to $0$ for unrelated problems and close to $1$ for identical problems.",label="tab_cor")
# xtable::print.xtable(x=xtable,sanitize.text.function=identity,rotate.colnames=TRUE,caption.placement="top",hline.after=c(0,nrow(table)),comment=FALSE,file=file.path(path,"manuscript","tab_cor.tex"),floating.environment="table*") #add.to.row=list(pos=list(5),command="\\hdashline \n")
# 
# table <- auc
# rownames(table) <- colnames(table) <- names
# table <- format(round(table,digits=2),digits=2)
# black <- auc.pvalue<=0.05
# star <- auc.pvalue<=0.05/(length(codes)*length(codes))
# diag(table) <- paste0("(",diag(table),")")
# table[!black] <- paste0("\\textcolor{gray}{",table[!black],"}")
# table[star] <- paste0(table[star],"$^\\star$")
# table[!star] <- paste0(table[!star],"\\phantom{$^\\star$}")
# table <- insert.space(table=table,cut=5)
# xtable <- xtable::xtable(x=table,align="rccccccccc",caption="Out-of-sample area under the receiver operating characteristic curve (\\textsc{roc-auc}) from logistic ridge regression trained on the dataset in the row and tested on the dataset in the column (off-diagonal entries), or cross-validated \\textsc{roc-auc} from logistic lasso regression trained and tested on the same dataset by $10$-fold external cross-validation (diagonal entries, between brackets). The \\textsc{roc-auc} of a random classifier is $0.5$, while that of a perfect classifier is $1.0$. Entries on and off the diagonal are not comparable. Predictions that are significantly better than random predictions (according to the one-sided Mann-Whitney $U$ test for testing whether the ranks of the predicted probabilities are significantly higher for the cases than for the controls) are highlighted, with black colour for nominal significance ($p$-value $\\leq 0.05$) and stars for adjusted significance ($p$-value $\\leq 0.05/64$).",label="tab_auc")
# xtable::print.xtable(x=xtable,sanitize.text.function=identity,rotate.colnames=TRUE,caption.placement="top",hline.after=c(0,nrow(table)),comment=FALSE,file=file.path(path,"manuscript","tab_auc.tex"),floating.environment="table*")

## ----transfer_apply,eval=sim.app----------------------------------------------
# #<<setup>>
# #<<define_projects>>
# #<<preprocess_data>>
# 
# result <- list()
# for(i in names(project)){
#   cat("project:",i,"\n")
#   result[[i]] <- list()
#   for(j in seq_len(5)){ # 5 repetitions of 10-fold CV
#     set.seed(j)
#     codes <- project[[i]]
#     result[[i]][[j]] <- cv_transfer(y=y[codes],X=x[codes],family="binomial",method=c("wrap_separate","wrap_glmtrans","sparselink","wrap_xrnet"),alpha.init=ifelse(i=="RA",0,0.95)) # lasso-like elastic net for IBD, ridge for RA (weak signal)
#   }
# }
# save(result,project,file=file.path(path,"results","application.RData"))
# 
# writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),
#       sessioninfo::session_info()),con=paste0(path,"/results/info_app.txt"))
# 
# # debugging
# if(FALSE){
# set.seed(1)
# codes <- project[["RA"]]
# test <- wrap_xrnet(x=x[codes],y=y[codes],alpha.init=0.95,alpha=1)
# 
# }

## ----transfer_figure,eval=fig.tab---------------------------------------------
# #<<setup>>
# 
# grDevices::postscript(file=file.path(path,"manuscript","fig_app.eps"),width=6.5,height=4,horizontal=FALSE,onefile=FALSE,paper="special")
# graphics::par(mfrow=c(2,1),mar=c(4,2,1,1),oma=c(0,0,0,0))
# load(file.path(path,paste0("results/application.RData")),verbose=TRUE)
# 
# model0 <- "wrap_glmtrans"
# model1 <- "sparselink"
# model2 <- "wrap_xrnet"
# 
# # predictivity
# metric <- lapply(result,function(x) do.call(what="rbind",args=lapply(x,function(x) x$auc))) # DEV and AUC need different directions (increase=FALSE/TRUE)!
# metric <- do.call(what="rbind",args=metric)
# metric <- metric/metric[,"wrap_separate"]
# #xlab <- refs[rownames(metric)]
# #names <- gsub(pattern="",replacement="\n",x=unlist(project))
# 
# label <- gsub(pattern="IBD.|RA.",replacement="",x=gsub(pattern=" ",replacement="\n",x=names(unlist(project))))
# index <- match(x=rownames(metric),table=unlist(project))
# 
# xlab <- label[index]
# plot_change(x=xlab,y0=metric[,model0],y1=metric[,model1],y2=metric[,model2],main="metric",increase=TRUE,cex.main=0.8)
# graphics::axis(side=1,at=length(project$IBD)+0.5,labels="|",tick=FALSE,line=-0.25,font=2)
# graphics::abline(h=0.5,lty=2,col="grey")
# graphics::abline(h=1,lty=2,col="grey")
# 
# # sparsity
# nzero <- lapply(result,function(x) lapply(x,function(x) sapply(x$refit$coef,function(x) colSums(x!=0))))
# nzero <- do.call(what="rbind",args=do.call(what="c",args=nzero))
# plot_change(x=xlab,y0=nzero[,model0],y1=nzero[,model1],y2=nzero[,model2],main="sparsity",increase=FALSE,cex.main=0.8)
# graphics::axis(side=1,at=length(project$IBD)+0.5,labels="|",tick=FALSE,line=-0.25,font=2)
# graphics::abline(h=0,lty=2,col="grey")
# grDevices::dev.off()
# 
# # percentage change
# # (reported in section 4 "application" subsection 4.3 "transfer learning")
# 
# disease <- ifelse(rownames(metric) %in% project$IBD,"IBD",ifelse(rownames(metric) %in% project$RA,"RA",NA))
# 
# #round(100*colMeans(metric)-100,digits=2)
# round(100*colMeans(metric[disease=="IBD",])-100,digits=2)
# round(100*colMeans(metric[disease=="RA",])-100,digits=2)
# 
# colMeans(nzero)
# colMeans(nzero[disease=="IBD",])
# colMeans(nzero[disease=="RA",])
# 
# #--- revision: report AUC ---
# Reduce(f="+",x=lapply(result$IBD,function(x) x$auc))/length(result$IBD)
# lapply(result,function(x) round(colMeans(Reduce(f="+",x=lapply(x,function(x) x$auc))/length(result$IBD)),digits=2))

## ----interpret_models,eval=fig.tab--------------------------------------------
# #<<setup>>
# 
# load(file.path(path,"results","application.RData"))
# 
# coefs <- list()
# for(i in seq_along(result$IBD)){
#   coefs[[i]] <- result$IBD[[i]]$refit$coef$sparselink
#   colnames(coefs[[i]]) <- names(project$IBD)
#   rownames(coefs[[i]]) <- rownames(result$IBD[[1]]$refit$coef$wrap_glmtrans) # try to avoid this
# }
# 
# any <- rowSums(sapply(coefs,function(x) apply(x,1,function(x) any(x!=0))))!=0
# for(i in seq_along(result$IBD)){
#   coefs[[i]] <- coefs[[i]][any,]
# }
# table <- Reduce(f="+",x=coefs)/5
# 
# cex <- 0.7
# 
# grDevices::postscript(file=file.path(path,"manuscript","fig_coef.eps"),width=7,height=4,horizontal=FALSE,onefile=FALSE,paper="special")
# graphics::par(mfrow=c(1,1),mar=c(2.5,4.5,0.5,1.5),oma=c(0,0,0,0))
# graphics::plot.new()
# graphics::plot.window(xlim=c(0.6,ncol(table)+0.4),ylim=c(0.5,nrow(table)+0.5))
# col <- apply(table,1,function(x) ifelse(all(x<=0),"blue",ifelse(all(x>=0),"red","black")))
# colnames <- gsub(x=colnames(table),pattern=" ",replacement="\n")
# graphics::mtext(text=colnames,side=1,at=seq_len(ncol(table)),cex=cex,line=1)
# rownames <- rownames(table)
# graphics::mtext(text=rownames,side=2,at=seq_len(nrow(table)),las=2,cex=cex,line=0.7,col=col)
# star <- rowSums(table!=0)>1
# graphics::mtext(text="*",side=2,at=which(star),line=-0.3)
# graphics::mtext(text=ifelse(col=="blue","-",ifelse(col=="red","+",".")),side=4,at=seq_len(nrow(table)),las=2,cex=cex,line=0.5,col=col)
# for(i in seq_len(nrow(table))){
#   for(j in seq_len(ncol(table))){
#     for(k in 1:5){
#       col <- ifelse(coefs[[k]][i,j]<0,"blue",ifelse(coefs[[k]][i,j]>0,"red","white"))
#       cex <- pmax(sqrt(5*abs(coefs[[k]][i,j])),0.2)
#       graphics::points(x=j-(-3+k)*0.17,y=i,col=col,cex=cex,pch=16)
#     }
#   }
# }
# graphics::abline(v=seq(from=0.5,to=5.5,by=1))
# grDevices::dev.off()

## ----eval=FALSE---------------------------------------------------------------
# path <- "C:/Users/arauschenberger/Desktop/sparselink" # LIH (Windows)
# #path <- "/Users/armin.rauschenberger/Desktop/LIH/sparselink" # LCSB (Mac)
# 
# dir <- c("results","manuscript","package/R/functions.R")
# for(i in seq_along(dir)){
#   if(!dir.exists(file.path(path,dir[i]))&!file.exists(file.path(path,dir[i]))){
#     stop(paste0("Require folder/file'",dir[i],"'."))
#   }
# }
# source(file.path(path,"package/R/functions.R")) # Or load 'sparselink' package.
# 
# inst <- rownames(utils::installed.packages())
# pkgs <- c("knitr","rmarkdown","glmnet","BiocManager","mvtnorm","glmtrans","spls","xrnet")
# for(i in seq_along(pkgs)){
#   if(!pkgs[i]%in%inst){
#     utils::install.packages(pkgs[i])
#   }
# }
# pkgs <- c("recount3","edgeR")
# for(i in seq_along(pkgs)){
#   if(!pkgs[i]%in%inst){
#     BiocManager::install(pkgs[i])
#   }
# }
# 
# blue <- "blue"; red <- "red"
# 
# if(exists("sim.app")&exists("fig.tab")){
#   if(!sim.app&fig.tab){
#     files <- c("simulation_multiple.RData","simulation_transfer.RData","recount3_data.RData","explore_data.RData","application.RData")
#     for(i in seq_along(files)){
#       if(!file.exists(file.path(path,"results",files[i]))){
#         stop("File",files[i],"is missing.")
#       }
#     }
#   }
# }
# project <- list()
# project$IBD <- c("Tew (2016)"="SRP063496",
#                  "Haberman (2019)"="SRP129004",
#                  "Verstockt (2019)"="ERP113396",
#                  "Verstockt (2020)"="ERP114636",
#                  "Boyd (2018)"="SRP100787")
# project$RA <- c("Baker (2019)"="SRP169062",
#                 "Moncrieffe (2017)"="SRP074736",
#                 "Goldberg (2018)"="SRP155483")
# extra <- c("Lewis (2019)"="ERP104864") # https://doi.org/10.1016/j.celrep.2019.07.091
# #<<setup>>
# #<<define_projects>>
# 
# load(file.path(path,"results/recount3_data.RData"))
# 
# #- - - - - - - - - - - - - - -
# #- - - extract features  - - -
# #- - - - - - - - - - - - - - -
# 
# # extract features
# x <- list()
# for(i in c(unlist(project),extra)){
#   counts <- t(SummarizedExperiment::assays(data[[i]])$raw_counts)
#   colnames(counts) <- SummarizedExperiment::rowRanges(data[[i]])$gene_name
#   x[[i]] <- counts
# }
# 
# # select most expressed protein-coding genes (for all TL projects together)
# select <- list()
# total <- numeric()
# for(i in unlist(project)){
#   #total <- rbind(total,Matrix::colSums(x[[i]])) # original: mean filtering
#   total <- rbind(total,apply(X=x[[i]],MARGIN=2,FUN=stats::var)) # trial: variance filtering
# }
# type <- SummarizedExperiment::rowData(data[[i]])$gene_type
# cond <- type=="protein_coding"
# total[,!cond] <- 0
# rank <- apply(X=total,MARGIN=1,FUN=rank)
# mean_rank <- rowMeans(rank)
# #temp <- cond & apply(total,2,function(x) all(x>0)) & (mean_rank >= sort(mean_rank[cond],decreasing=TRUE)[2000]) # original: top 2000
# temp <- cond & mean_rank >= sort(mean_rank[cond],decreasing=TRUE)[5000] # trial: top 5000
# 
# for(i in unlist(project)){
#   select[[i]] <- temp
# }
# 
# # select most expressed protein-coding genes (for MTL project)
# #mean <- apply(X=x[[extra]],MARGIN=2,FUN=mean) # original
# var <- apply(X=x[[extra]],MARGIN=2,FUN=var) # trial
# #warning("change number in next line")
# #temp <- cond & mean >= sort(mean[cond],decreasing=TRUE)[5000] # trial: top 5000
# temp <- cond & var >= sort(var[cond],decreasing=TRUE)[5000] # trial: top 5000
# select[[extra]] <- temp
# 
# # pre-processing
# for(i in c(unlist(project),extra)){
#   lib.size <- Matrix::rowSums(x[[i]])
#   x[[i]] <- x[[i]][,select[[i]],drop=FALSE]
#   norm.factors <- edgeR::calcNormFactors(object=t(x[[i]]),lib.size=lib.size)
#   gamma <- norm.factors*lib.size/mean(lib.size)
#   gamma <- matrix(data=gamma,nrow=nrow(x[[i]]),ncol=ncol(x[[i]]))
#   x[[i]] <- x[[i]]/gamma
#   x[[i]] <- 2*sqrt(x[[i]] + 3/8) # Anscombe transform
#   x[[i]] <- scale(x[[i]]) # scale because of different datasets!?
# }
# 
# #- - - - - - - - - - - - - -
# #- - - extract targets - - -
# #- - - - - - - - - - - - - -
# 
# # extract information on samples
# frame <- list()
# for(i in c(unlist(project),extra)){
#   list <- strsplit(data[[i]]$sra.sample_attributes,split="\\|")
#   data[[i]]$sra.experiment_attributes
#   # What about sra.experiment_attributes?
#   n <- length(list)
#   cols <- unique(sapply(strsplit(unlist(list),split=";;"),function(x) x[1]))
#   ncol <- length(cols)
#   frame[[i]] <- matrix(data=NA,nrow=n,ncol=ncol,dimnames=list(rownames(x[[i]]),cols))
#   for(j in seq_len(n)){
#     for(k in seq_len(ncol)){
#       vector <- list[[j]]
#       which <- which(substring(text=vector,first=1,last=nchar(cols[k]))==cols[k])
#       string <- vector[which]
#       if(length(string)==0){next}
#       frame[[i]][j,k] <- strsplit(string,split=";;")[[1]][2]
#     }
#   }
#   frame[[i]] <- as.data.frame(frame[[i]])
# }
# 
# # extract binary outcome
# y <- z <- list()
# for(i in unlist(project)){
#   # CONTINUE HERE!!!
#   if(i=="ERP113396"){
#     y[[i]] <- sapply(X=frame[[i]]$`clinical history`,FUN=function(x) switch(EXPR=x,"responder"=1,"non-responder"=0,stop("invalid")))
#   } else if(i=="ERP114636"){
#     y[[i]] <- sapply(X=frame[[i]]$`clinical information`,FUN=function(x) switch(EXPR=x,"response to vedolizumab therapy"=1-1,"no response to vedolizumab therapy"=0+1,stop("invalid")))
#     warning("Inverting response and non-response!")
#   } else if(i=="SRP100787"){
#     y[[i]] <- sapply(X=frame[[i]]$condition,FUN=function(x) switch(EXPR=x,"CD inactive"=1,"UC inactive"=1,"CD active"=0,"UC active"=0,control=NA,"NA"=NA,stop("invalid")))
#   } else if(i=="SRP129004"){
#     y[[i]] <- sapply(X=frame[[i]]$`week 4 remission`,FUN=function(x) switch(EXP=x,"Yes"=1,"No"=0,"NA"=NA,stop("invalid")))
#     suppressWarnings(z[[i]] <- data.frame(pucai=as.numeric(frame[[i]]$pucai),mayo=as.numeric(frame[[i]]$`total mayo score`),histology=as.numeric(frame[[i]]$`histology severity score`)))
#   } else if(i=="SRP063496"){
#     y[[i]] <- sapply(X=frame[[i]]$`remission at week 10`,FUN=function(x) switch(x, "Remitter"=1,"Non-remitter"=0,"N/A"=NA,stop("invalid")))
#   } else if(i=="SRP169062"){
#     y[[i]] <- sapply(X=frame[[i]]$`flare event`,FUN=function(x) switch(x,"no flare"=1,"flare"=0,stop("invalid")))
#   } else if(i=="SRP155483"){
#     y[[i]] <- sapply(X=frame[[i]]$`disease activity`,FUN=function(x) switch(x,"remission"=1,"Low"=0,"Moderate"=0,"High"=0,"--"=NA,stop("invalid")))
#     z[[i]] <- sapply(X=frame[[i]]$`disease activity`,FUN=function(x) switch(x,"remission"=0,"Low"=1,"Moderate"=2,"High"=3,"--"=NA,stop("invalid")))
#   } else if(i=="SRP074736"){
#     y[[i]] <- sapply(X=frame[[i]]$`mtx response status`,FUN=function(x) switch(x,"responder"=1,"non-responder"=0,"control"=NA,stop("invalid")))
#   }
# }
# 
# # overlap
# for(j in unlist(project)){
#   is.na <- is.na(y[[j]])
#   if(length(is.na)!=nrow(x[[j]])){stop()}
#   y[[j]] <- y[[j]][!is.na]
#   if(!is.null(z[[j]])){
#     if(is.vector(z[[j]])){
#       z[[j]] <- z[[j]][!is.na]
#     } else {
#       z[[j]] <- z[[j]][!is.na,]
#     }
#   }
#   x[[j]] <- x[[j]][!is.na,]
# }
# source(file.path(path,"package/R/development.R"))
# 
# x <- x$SRP100787
# y <- y$SRP100787
# sd <- apply(x,2,sd) # temporary # REMOVE THIS!
# x <- x[,sd>=sort(sd,decreasing=TRUE)[100]] # REMOVE THIS!
# iter <- 10
# q <- 2
# n <- nrow(x)
# 
# #-- -- --- ---
# #--- MTL ---
# #-- -- --- ---
# 
# prob_mtl <- c(0.00,0.05,0.10,0.15,0.20)
# auc_mtl <- list()
# for(i in seq_along(prob_mtl)){
#   auc_mtl[[i]] <- list()
#   for(j in seq_len(iter)){
#     Y <- matrix(data=NA,nrow=n,ncol=q)
#     for(k in seq_len(q)){
#       Y[,k] <- abs(y - stats::rbinom(n=n,size=1,prob=prob_mtl[i]))
#     }
#     #auc_mtl[[i]][[j]] <- cv_multiple(y=Y,X=x,method=c("wrap_separate","sparselink"),family="binomial",alpha.init=0.95,alpha=1,type="exp",trial=FALSE)$auc
#     auc_mtl[[i]][[j]] <- joinet::cv.joinet(Y=Y,X=x,family="binomial")
#   }
# }
# # MTL is beneficial under prob=0.15
# 
# # for joinet (deviance: lower=better)
# a <- sapply(auc_mtl,function(x) rowMeans(sapply(x,function(x) rowMeans(x))))
# plot(prob_mtl,y=a["base",],type="o")
# lines(prob_mtl,y=a["meta",],type="o",lty=2)
# 
# 
# #lapply(auc_mtl,function(x) rowMeans(sapply(x,function(x) colMeans(x))))
# 
# #--- --- --- ---
# #--- TL ---
# #--- --- --- ---
# 
# prob_tl <- c(0.3,0.4,0.5,0.6,0.7)
# auc_tl <- list()
# for(i in seq_along(prob_tl)){
#   auc_tl[[i]] <- list()
#   cond <- rep(x=NA,times=n)
#   Y <- X <- list()
#   for(j in seq_len(iter)){
#     n0 <- round(prob_tl[i]*sum(y==0))
#     n1 <- round(prob_tl[i]*sum(y==1))
#     cond[y==0] <- sample(rep(x=c(0,1),times=c(n0,sum(y==0)-n0))) #sample(rep(x=c(0,1),length.out=sum(y==0)))
#     cond[y==1] <- sample(rep(x=c(0,1),times=c(n1,sum(y==1)-n1))) #sample(rep(x=c(0,1),length.out=sum(y==1)))
#     Y <- list(y1=y[cond==0],y2=y[cond==1])
#     X <- list(x1=x[cond==0,],x2=x[cond==1,])
#     auc_tl[[i]][[j]] <- cv_transfer(y=Y,X=X,method=c("wrap_separate","sparselink"),family="binomial",alpha.init=0.95,alpha=1,type="exp",trial=FALSE)$auc
#   }
# }
# 
# save(prob_mtl,auc_mtl,prob_tl,auc_tl,file=file.path(path,"results","sim-devel.RData"))
# 
# graphics::par(mfrow=c(1,2))
# 
# # MTL: AUC vs contamination
# aucs <- lapply(auc_mtl,function(x) sapply(x,function(x) colMeans(x)))
# auc <- sapply(aucs,function(x) rowMeans(x))
# graphics::plot(x=prob_mtl,auc["sparselink",],type="o",ylim=c(0.4,0.8),col="blue")
# graphics::lines(x=prob_mtl,auc["wrap_separate",],type="o",col="red")
# graphics::abline(h=0.5,lty=2,col="grey")
# 
# # TL: source/target sample size
# aucs <- lapply(auc_tl,function(x) sapply(x,function(x) x[2,]))
# auc <- sapply(aucs,function(x) rowMeans(x))
# graphics::plot(x=prob_tl,auc["sparselink",],type="o",ylim=c(0.4,0.8),col="blue")
# graphics::lines(x=prob_tl,auc["wrap_separate",],type="o",col="red")
# graphics::abline(h=0.5,lty=2,col="grey")
# 
# # statistical testing
# 
# 
# 

## ----multi-task,eval=FALSE----------------------------------------------------
# path <- "C:/Users/arauschenberger/Desktop/sparselink" # LIH (Windows)
# #path <- "/Users/armin.rauschenberger/Desktop/LIH/sparselink" # LCSB (Mac)
# 
# dir <- c("results","manuscript","package/R/functions.R")
# for(i in seq_along(dir)){
#   if(!dir.exists(file.path(path,dir[i]))&!file.exists(file.path(path,dir[i]))){
#     stop(paste0("Require folder/file'",dir[i],"'."))
#   }
# }
# source(file.path(path,"package/R/functions.R")) # Or load 'sparselink' package.
# 
# inst <- rownames(utils::installed.packages())
# pkgs <- c("knitr","rmarkdown","glmnet","BiocManager","mvtnorm","glmtrans","spls","xrnet")
# for(i in seq_along(pkgs)){
#   if(!pkgs[i]%in%inst){
#     utils::install.packages(pkgs[i])
#   }
# }
# pkgs <- c("recount3","edgeR")
# for(i in seq_along(pkgs)){
#   if(!pkgs[i]%in%inst){
#     BiocManager::install(pkgs[i])
#   }
# }
# 
# blue <- "blue"; red <- "red"
# 
# if(exists("sim.app")&exists("fig.tab")){
#   if(!sim.app&fig.tab){
#     files <- c("simulation_multiple.RData","simulation_transfer.RData","recount3_data.RData","explore_data.RData","application.RData")
#     for(i in seq_along(files)){
#       if(!file.exists(file.path(path,"results",files[i]))){
#         stop("File",files[i],"is missing.")
#       }
#     }
#   }
# }
# project <- list()
# project$IBD <- c("Tew (2016)"="SRP063496",
#                  "Haberman (2019)"="SRP129004",
#                  "Verstockt (2019)"="ERP113396",
#                  "Verstockt (2020)"="ERP114636",
#                  "Boyd (2018)"="SRP100787")
# project$RA <- c("Baker (2019)"="SRP169062",
#                 "Moncrieffe (2017)"="SRP074736",
#                 "Goldberg (2018)"="SRP155483")
# extra <- c("Lewis (2019)"="ERP104864") # https://doi.org/10.1016/j.celrep.2019.07.091
# #<<setup>>
# #<<define_projects>>
# 
# load(file.path(path,"results/recount3_data.RData"))
# 
# #- - - - - - - - - - - - - - -
# #- - - extract features  - - -
# #- - - - - - - - - - - - - - -
# 
# # extract features
# x <- list()
# for(i in c(unlist(project),extra)){
#   counts <- t(SummarizedExperiment::assays(data[[i]])$raw_counts)
#   colnames(counts) <- SummarizedExperiment::rowRanges(data[[i]])$gene_name
#   x[[i]] <- counts
# }
# 
# # select most expressed protein-coding genes (for all TL projects together)
# select <- list()
# total <- numeric()
# for(i in unlist(project)){
#   #total <- rbind(total,Matrix::colSums(x[[i]])) # original: mean filtering
#   total <- rbind(total,apply(X=x[[i]],MARGIN=2,FUN=stats::var)) # trial: variance filtering
# }
# type <- SummarizedExperiment::rowData(data[[i]])$gene_type
# cond <- type=="protein_coding"
# total[,!cond] <- 0
# rank <- apply(X=total,MARGIN=1,FUN=rank)
# mean_rank <- rowMeans(rank)
# #temp <- cond & apply(total,2,function(x) all(x>0)) & (mean_rank >= sort(mean_rank[cond],decreasing=TRUE)[2000]) # original: top 2000
# temp <- cond & mean_rank >= sort(mean_rank[cond],decreasing=TRUE)[5000] # trial: top 5000
# 
# for(i in unlist(project)){
#   select[[i]] <- temp
# }
# 
# # select most expressed protein-coding genes (for MTL project)
# #mean <- apply(X=x[[extra]],MARGIN=2,FUN=mean) # original
# var <- apply(X=x[[extra]],MARGIN=2,FUN=var) # trial
# #warning("change number in next line")
# #temp <- cond & mean >= sort(mean[cond],decreasing=TRUE)[5000] # trial: top 5000
# temp <- cond & var >= sort(var[cond],decreasing=TRUE)[5000] # trial: top 5000
# select[[extra]] <- temp
# 
# # pre-processing
# for(i in c(unlist(project),extra)){
#   lib.size <- Matrix::rowSums(x[[i]])
#   x[[i]] <- x[[i]][,select[[i]],drop=FALSE]
#   norm.factors <- edgeR::calcNormFactors(object=t(x[[i]]),lib.size=lib.size)
#   gamma <- norm.factors*lib.size/mean(lib.size)
#   gamma <- matrix(data=gamma,nrow=nrow(x[[i]]),ncol=ncol(x[[i]]))
#   x[[i]] <- x[[i]]/gamma
#   x[[i]] <- 2*sqrt(x[[i]] + 3/8) # Anscombe transform
#   x[[i]] <- scale(x[[i]]) # scale because of different datasets!?
# }
# 
# #- - - - - - - - - - - - - -
# #- - - extract targets - - -
# #- - - - - - - - - - - - - -
# 
# # extract information on samples
# frame <- list()
# for(i in c(unlist(project),extra)){
#   list <- strsplit(data[[i]]$sra.sample_attributes,split="\\|")
#   data[[i]]$sra.experiment_attributes
#   # What about sra.experiment_attributes?
#   n <- length(list)
#   cols <- unique(sapply(strsplit(unlist(list),split=";;"),function(x) x[1]))
#   ncol <- length(cols)
#   frame[[i]] <- matrix(data=NA,nrow=n,ncol=ncol,dimnames=list(rownames(x[[i]]),cols))
#   for(j in seq_len(n)){
#     for(k in seq_len(ncol)){
#       vector <- list[[j]]
#       which <- which(substring(text=vector,first=1,last=nchar(cols[k]))==cols[k])
#       string <- vector[which]
#       if(length(string)==0){next}
#       frame[[i]][j,k] <- strsplit(string,split=";;")[[1]][2]
#     }
#   }
#   frame[[i]] <- as.data.frame(frame[[i]])
# }
# 
# # extract binary outcome
# y <- z <- list()
# for(i in unlist(project)){
#   # CONTINUE HERE!!!
#   if(i=="ERP113396"){
#     y[[i]] <- sapply(X=frame[[i]]$`clinical history`,FUN=function(x) switch(EXPR=x,"responder"=1,"non-responder"=0,stop("invalid")))
#   } else if(i=="ERP114636"){
#     y[[i]] <- sapply(X=frame[[i]]$`clinical information`,FUN=function(x) switch(EXPR=x,"response to vedolizumab therapy"=1-1,"no response to vedolizumab therapy"=0+1,stop("invalid")))
#     warning("Inverting response and non-response!")
#   } else if(i=="SRP100787"){
#     y[[i]] <- sapply(X=frame[[i]]$condition,FUN=function(x) switch(EXPR=x,"CD inactive"=1,"UC inactive"=1,"CD active"=0,"UC active"=0,control=NA,"NA"=NA,stop("invalid")))
#   } else if(i=="SRP129004"){
#     y[[i]] <- sapply(X=frame[[i]]$`week 4 remission`,FUN=function(x) switch(EXP=x,"Yes"=1,"No"=0,"NA"=NA,stop("invalid")))
#     suppressWarnings(z[[i]] <- data.frame(pucai=as.numeric(frame[[i]]$pucai),mayo=as.numeric(frame[[i]]$`total mayo score`),histology=as.numeric(frame[[i]]$`histology severity score`)))
#   } else if(i=="SRP063496"){
#     y[[i]] <- sapply(X=frame[[i]]$`remission at week 10`,FUN=function(x) switch(x, "Remitter"=1,"Non-remitter"=0,"N/A"=NA,stop("invalid")))
#   } else if(i=="SRP169062"){
#     y[[i]] <- sapply(X=frame[[i]]$`flare event`,FUN=function(x) switch(x,"no flare"=1,"flare"=0,stop("invalid")))
#   } else if(i=="SRP155483"){
#     y[[i]] <- sapply(X=frame[[i]]$`disease activity`,FUN=function(x) switch(x,"remission"=1,"Low"=0,"Moderate"=0,"High"=0,"--"=NA,stop("invalid")))
#     z[[i]] <- sapply(X=frame[[i]]$`disease activity`,FUN=function(x) switch(x,"remission"=0,"Low"=1,"Moderate"=2,"High"=3,"--"=NA,stop("invalid")))
#   } else if(i=="SRP074736"){
#     y[[i]] <- sapply(X=frame[[i]]$`mtx response status`,FUN=function(x) switch(x,"responder"=1,"non-responder"=0,"control"=NA,stop("invalid")))
#   }
# }
# 
# # overlap
# for(j in unlist(project)){
#   is.na <- is.na(y[[j]])
#   if(length(is.na)!=nrow(x[[j]])){stop()}
#   y[[j]] <- y[[j]][!is.na]
#   if(!is.null(z[[j]])){
#     if(is.vector(z[[j]])){
#       z[[j]] <- z[[j]][!is.na]
#     } else {
#       z[[j]] <- z[[j]][!is.na,]
#     }
#   }
#   x[[j]] <- x[[j]][!is.na,]
# }
# source(file.path(path,"package/R/development.R"))
# 
# #- - - - - - - - - -
# #- - prepare data - -
# #- - - - - - - - - -
# 
# # explore SRP048801
# 
# project <- "ERP104864"
# if(project=="SRP155483"){
#   X <- x$SRP155483
#   vars <- c("disease activity","das score")
#   Y <- frame$SRP155483[,vars]
#   sapply(unique(Y$`disease activity`),function(x) range(as.numeric(Y$`das score`[Y$`disease activity`==x])))
#   warning("DAS score defines disease activity!")
#   problem <- list()
#   problem$unique <- colnames(Y)
# } else if(project=="SRP129004"){
#   X <- x$SRP129004
#   vars <- c("histology severity score","total mayo score")
#   Y <- frame$SRP129004[,vars]
#   names <- intersect(rownames(X),rownames(Y))
#   X <- X[names,]
#   Y <- Y[names,]
#   cond <- frame$SRP129004[names,"diagnosis"]=="Ulcerative Colitis"
#   X <- X[cond,]
#   Y <- Y[cond,]
#   problem <- list()
#   problem$unique <- colnames(Y)
# } else if(project=="ERP104864"){
#   X <- x$ERP104864
#   vars <- c("CCP","CRP","crp","DAS28","ESR","esr","HAQ","VAS","SWOLLEN","TENDER") # "inflammatory score" dropped due to NA # "RF" dropped because binary
#   Y <- frame$ERP104864[,vars]
#   pathotype <- frame$ERP104864$pathotype
#   if(any(rownames(X)!=rownames(Y))){stop()}
#   #Y <- Y[Y$pathotype=="lymphoid",vars]
#   if(any(!is.na(Y$CRP)&!is.na(Y$crp))){
#     stop("Invalid.")
#   } else {
#     Y$CRP[is.na(Y$CRP)] <- Y$crp[is.na(Y$CRP)]
#     Y$crp <- NULL
#   }
#   if(any(!is.na(Y$ESR)&!is.na(Y$esr))){
#     stop("Invalid.")
#   } else {
#     Y$ESR[is.na(Y$ESR)] <- Y$esr[is.na(Y$ESR)]
#     Y$esr <- NULL
#   }
#   problem <- list()
#   #problem$joint <- c("SWOLLEN","TENDER")
#   #problem$proms <- c("VAS","HAQ")
#   #problem$labor <- c("CRP","ESR") # CCP might be too different (check cor)
#   problem$trial <- c("CRP","ESR","SWOLLEN")
#   #problem$trial <- c("DAS28","SWOLLEN","TENDER")
#   #problem$all <- colnames(y)
# }
# 
# for(i in seq_len(ncol(Y))){
#   class(Y[[i]]) <- "numeric"
# }
# 
# cond <- apply(X=Y,MARGIN=1,FUN=function(x) all(!is.na(x))) #& pathotype=="lymphoid" # subtypes seem to be defined based on clinical scores
# #Y <- as.matrix(Y)[cond,] # temporary: binarisation
# y <- scale(as.matrix(Y)[cond,])
# x <- X[cond,]
# 
# #- - - - - - - - - - - - -
# #- - explore similarity - -
# #- - - - - - - - - - - - -
# 
# cor <- stats::cor(y,use="pairwise.complete.obs",method="spearman")
# round(x=cor,digits=2)
# graphics::image(t(cor)[,ncol(cor):1])
# #stats::heatmap(x=as.matrix(Y),Rowv=NA)
# d <- stats::dist(x=t(Y))
# hclust <- stats::hclust(d=d)
# graphics::plot(hclust)
# 
# graphics::par(mfrow=c(2,4),mar=c(2,2,1,1))
# for(i in seq_len(ncol(y))){
#     pvalue <- apply(x,2,function(x) cor.test(x,y[,i],method="spearman")$p.value)
#     graphics::hist(x=pvalue,main=colnames(y)[i])
# }
# 
# #--- relationship between cor coef and cor p-value ---
# coef <- stats::cor(y=y[,"CRP"],x=x,method="spearman")
# pval <- apply(X=x,MARGIN=2,FUN=function(x) stats::cor.test(x=x,y=y[,"CRP"],method="spearman")$p.value)
# graphics::plot(x=coef,y=sign(coef)*(-log10(pval))^0.5)
# #--- conclusion: equivalent ---
# 
# #--- examine correlation between ridge coefficients ---
# coef <- matrix(data=NA,nrow=ncol(x),ncol=ncol(y),dimnames=list(NULL,colnames(y)))
# for(i in seq_len(ncol(y))){
#   object <- glmnet::cv.glmnet(x=x,y=y[,i],family="gaussian",alpha=0)
#   coef[,i] <- stats::coef(object=object,s="lambda.min")[-1]
# }
# round(stats::cor(coef,method="spearman"),digits=2)
# #--- conclusion: some sets are strongly correlated ---
# 
# #cor(rowSums(scale(Y)),Y,method="spearman")
# 
# #- - - - - - - - - - - - -
# #- - cross-validation - -
# #- - - - - - - - - - - - -
# 
# alpha.init <- 0.95; alpha <- 1; type <- "exp"; trial <- FALSE
# #alpha.init <- NA; alpha <- 1; type <- "exp"; trial <- TRUE
# results <- list()
# for(k in seq_along(problem)){
#   results[[k]] <- list()
#   for(i in seq_len(3)){ # 5 repetitions of 10-fold CV
#     set.seed(i)
#     method <- c("wrap_empty","wrap_separate","wrap_mgaussian","sparselink","wrap_spls") #"sparselink" "group.devel"
#     results[[k]][[i]] <-  cv_multiple(y=y[,problem[[k]]],X=x,family="gaussian",method=method,nfolds=10,alpha=alpha,alpha.init=alpha.init,type=type,trial=trial)
#   }
# }
# 
# save(results,file=file.path(path,"results","app_mtl_devel.RData"))
# 
# lapply(results,function(x) lapply(x,function(x) colMeans(x$deviance)))
# 
# 
# colMeans(results[[1]][[1]]$deviance/results[[1]][[1]]$deviance[,"wrap_empty"])
# names(results) <- names(problem)
# 
# colMeans(results$trial[[1]]$deviance/results$trial[[1]]$deviance[,"wrap_empty"])
# 
# lapply(results,function(x) rowMeans(sapply(x,function(x) rowMeans(apply(x$deviance,1,rank)))))
# 
# object <- devel(y=y[,problem$trial],x=x,family="gaussian")
# 
# cm <- numeric()
# for(k in seq_along(problem)){
#   for(i in seq_len(1)){
#     dev <- results[[k]][[i]]$deviance
#     temp <- colMeans(dev/dev[,"wrap_separate"])
#     cm <- rbind(cm,temp)
#     #cat(cm,"\n")
#   }
# }
# colMeans(cm)
# 
# # separate, mgaussian, sparselink and devel have similar performance (i.e., MTL has no benefit with any of these three approaches), examine whether other approaches are better
# 
# # examine whether group lasso for TL/MTL performs better
# 
# # Consider using different candidate values, e.g., 0.01,0.5,1,1.5,2,10 also for sparselink. Removing 0 might be a good choice. (If 0 is not included, however, initial ridge regression might be better.)
# 
# #- - - - - - - - - - - -
# #- - learning curve - -
# #- - - - - - - - - - - -
# 
# size <- unique(c(seq(from=50,to=nrow(x),by=25),nrow(x)))
# iter <- 10 # increase to 10
# size <- nrow(x) # suppress learning curve
# 
# metric <- list()
# setting <- expand.grid(problem=names(problem),size=size,iter=seq_len(iter))
# for(i in seq_len(nrow(setting))){
#   cat(progress=paste0(100*i/nrow(setting),"%"),"\n")
#   cond <- sample(x=rep(x=c(FALSE,TRUE),times=c(nrow(x)-setting$size[i],setting$size[i])))
#   vars <- problem[[setting$problem[i]]]
#   metric[[i]] <- joinet:::cv.joinet(Y=y[cond,vars],X=x[cond,],family="gaussian",compare="mnorm")
#   #method <- c("wrap_empty","wrap_separate","wrap_mgaussian","sparselink")
#   #metric[[i]] <-  cv_multiple(y=y[cond,vars],X=x[cond,],family="gaussian",method=method,nfolds=10,alpha.init=0.95,alpha=1,type="exp",trial=FALSE)
# }
# frac <- sapply(X=metric,FUN=function(x) colMeans(t(x)/x["none",]))
# 
# #frac <- sapply(X=metric,FUN=function(x) colMeans(x$deviance/x$deviance[,"wrap_empty"]))
# 
# graphics::par(mfrow=c(1,length(problem)),mar=c(3,3,2,0.5))
# for(i in seq_along(problem)){
#   graphics::plot.new()
#   graphics::plot.window(xlim=range(size),ylim=range(frac))
#   graphics::box()
#   graphics::title(main=names(problem)[i])
#   graphics::axis(side=1)
#   graphics::axis(side=2)
#   graphics::abline(h=1,col="grey",lty=2)
#   cond <- setting$problem==names(problem)[i]
#   col <- c(base="red",mnorm="orange",meta="blue")
#   #col <- c(wrap_separate="red",sparselink="blue")
#   for(j in names(col)){
#     val <- tapply(X=frac[j,cond],INDEX=setting$size[cond],FUN=mean)
#     graphics::points(x=setting$size[cond],y=frac[j,cond],col=col[j])
#     graphics::lines(x=size,y=val,col=col[j],type="o",pch=16)
#   }
# }
# 
# #  JOINET is better than standard lasso when n>=100, performance is similar to spls and better than glmnet-mgaussian
# 
# #--- --- --- --- --- ---
# # binarisation ---
# #--- --- --- --- --- ---
# 
# 
# yy <- 1*cbind(y[,"CCP"]>20,y[,"CRP"]>10,y[,"DAS28"]>5)
# metric <- list()
# for(i in 1:10){
#   metric[[i]] <- joinet:::cv.joinet(Y=yy,X=x,family="binomial")
# }
# 
# #rowMeans(sapply(metric,function(x) rowMeans(x)))
# 
# #--- --- --- --- --- --- ---
# #--- explore other datasets ---
# #--- --- --- --- --- --- ---
# 
# #projects <- recount3::available_projects(organism="human")
# #recount3::read_metadata()
# 
# 
# #data <- xlsx::read.xlsx(paste0(path,"/results/PPMI_Curated_Data_Cut_Public_20250321.xlsx"),sheetIndex=1)
# data <- read.csv(paste0(path,"/results/PPMI_Curated_Data_Cut_Public_20250321.csv"))
# 
# # baseline data (predictors)
# x <- data[data$COHORT==1 & data$EVENT_ID=="BL",]
# rownames(x) <- x$PATNO
# prop.miss <- colMeans(is.na(x))
# x <- x[,prop.miss<=0.5 & sapply(x,class) %in% c("numeric","integer")]
# x <- x[,which(colnames(x)=="age"):ncol(x)]
# x <- missRanger::missRanger(data=x,pmm.k=3,num.trees=100,verbose=0,seed=1)
# x <- scale(x)
# 
# # follow-up data (outcomes)
# visit <- c("V04","V06","V08")
# y <- data[data$COHORT==1 & data$EVENT_ID %in% visit,]
# colnames(y)[colnames(y)=="updrs_totscore"] <- "updrs"
# vars <- c("moca","quip","updrs","gds","scopa","ess","bjlot","rem")
# y <- y[,c("EVENT_ID","PATNO",vars)]
# y <- reshape(data=y,idvar="PATNO",timevar="EVENT_ID",direction="wide")
# rownames(y) <- y$PATNO; y$PATNO <- NULL
# 
# # overlap
# names <- intersect(rownames(x),rownames(y))
# x <- x[names,]
# y <- y[names,]
# y <- as.matrix(y)
# #y <- scale(y)
# 
# metric <- list()
# for(i in seq_along(vars)){
#   cat(vars[i],"\n")
#   cols <- paste0(vars[i],".",visit)
#   cond <- rowSums(is.na(y[,cols]))==0 # temporary
#   if(sum(cond)>250){cond[cumsum(cond)>250] <- FALSE} # temporary
#   metric[[i]] <- joinet:::cv.joinet(Y=y[cond,cols],X=x[cond,],family="gaussian",compare="mnorm")
#   #method <- c("wrap_empty","wrap_separate","wrap_mgaussian","sparselink","devel") # ,"sparselink")
#   #metric[[i]] <- cv_multiple(y=y[cond,cols],X=x[cond,],family="gaussian",method=method,nfolds=10,alpha.init=0.95,alpha=1,type="exp",trial=FALSE)
# }
# 
# #rowMeans(sapply(metric,function(x) colMeans(t(x)/x["none",])))
# rowMeans(sapply(metric,function(x) colMeans(x$deviance/x$deviance[,"wrap_empty"])))
# 

## ----session_info,echo=FALSE,eval=fig.tab-------------------------------------
# writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),      sessioninfo::session_info()),con=paste0(path,"/results/info_figtab.txt"))