## ----options,include = FALSE-------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE,comment = "#>", echo = TRUE,cache = TRUE, dev = "png") # Fix seed set.seed(12) ## ----setup,warning = FALSE---------------------------------------------------- req_packs = c("devtools","ggplot2","smarter","SMASH") for(pack in req_packs){ library(package = pack,character.only = TRUE) } # List package's exported functions ls("package:SMASH") ## ----------------------------------------------------------------------------- eS[[3]][[1]] ## ----------------------------------------------------------------------------- eS[[3]][[2]] ## ----echo = FALSE------------------------------------------------------------- maxLOCI = 50 meanDP = 1e3 nCN = 3 ## ----sim---------------------------------------------------------------------- sim = gen_subj_truth( mat_eS = eS[[3]][[2]], maxLOCI = maxLOCI, nCN = nCN) class(sim) names(sim) ## ----------------------------------------------------------------------------- # Underlying true allocation, multiplicity, and cellular prevalence per point mutation dim(sim$subj_truth) sim$subj_truth[1:5,] # Combinations of allelic copy number, allocation, and multiplicity # with resulting mean VAFs (written as MAF) and cellular prevalences uniq_states = unique(sim$subj_truth[, c("CN_1","CN_2","true_A","true_M","true_MAF","true_CP")]) rownames(uniq_states) = NULL round(uniq_states,3) # Tumor purity sim$purity # Tumor subclone proportions sim$eta # Cancer subclone proportions sim$eta / sim$purity sim$q ## ----------------------------------------------------------------------------- mat_RD = gen_ITH_RD(DATA = sim$subj_truth,RD = meanDP) dim(mat_RD); mat_RD[1:5,] smart_hist(rowSums(mat_RD),breaks = 20, xlab = "Total Depth",cex.lab = 1.5) # Combine copy number and read count information input = cbind(sim$subj_truth,mat_RD) # Calculate observed VAF input$VAF = input$tAD / rowSums(input[,c("tAD","tRD")]) # Remove rows with no alternate depth input = input[which(input$tAD > 0),] dim(input) input[1:3,] ## ----------------------------------------------------------------------------- # All loci smart_hist(input$VAF,breaks = 30,main = "All Loci", xlab = "Observed VAF",cex.lab = 1.5) abline(v = unique(input$true_MAF),lty = 2,lwd = 2,col = "magenta") # Loci per copy number state uCN = unique(input[,c("CN_1","CN_2")]) tmp_range = range(input$VAF) for(ii in seq(nrow(uCN))){ # ii = 3 idx = which(input$CN_1 == uCN$CN_1[ii] & input$CN_2 == uCN$CN_2[ii]) smart_hist(input$VAF[idx],breaks = 20,xlim = tmp_range, main = sprintf("Loci with (CN_1,CN_2) = (%s,%s)",uCN$CN_1[ii],uCN$CN_2[ii]), xlab = "Observed VAF",cex.lab = 1.5) abline(v = unique(input$true_MAF[idx]),lty = 2,lwd = 2,col = "magenta") } ## ----know_config,fig.dim = c(5,5)--------------------------------------------- # Calculate true_unc_q, the unconstrained subclone proportions in cancer true_unc_q = log(sim$q[-length(sim$q)] / sim$q[length(sim$q)]) true_unc_q # Optimize ith_out = ITH_optim(my_data = input, my_purity = sim$purity, pi_eps0 = 0, my_unc_q = true_unc_q, init_eS = eS[[3]][[2]]) # Estimate of unclassified loci ith_out$pi_eps # Model fit BIC ith_out$BIC # Estimate of subclone proportions in cancer nSC = length(ith_out$q) tmp_df = smart_df(SC = as.character(rep(seq(nSC),2)), CLASS = c(rep("Truth",nSC),rep("Estimate",nSC)), VALUE = c(sim$q,ith_out$q)) tmp_df ggplot(data = tmp_df,mapping = aes(x = SC,y = VALUE,fill = CLASS)) + geom_bar(width = 0.5,stat = "identity",position = position_dodge()) + xlab("Subclone") + ylab("Proportion in Cancer") + theme(legend.position = "bottom", text = element_text(size = 20)) # Compare truth vs estimated/inferred ## Variant Allele Frequency smoothScatter(input$true_MAF,ith_out$infer$infer_MAF, main = "VAF",xlab = "Truth",ylab = "Inferred",cex.lab = 1.5) abline(a = 0,b = 1,lwd = 2,lty = 2,col = "red") smoothScatter(input$VAF,ith_out$infer$infer_MAF, main = "VAF",xlab = "Observed",ylab = "Inferred",cex.lab = 1.5) abline(a = 0,b = 1,lwd = 2,lty = 2,col = "red") ## Cellular prevalence smoothScatter(input$true_CP, ith_out$infer$infer_CP,main = "Cellular Prevalence", xlim = c(0,1),ylim = c(0,1), xlab = "Truth",ylab = "Estimate",cex.lab = 1.5) abline(a = 0,b = 1,lwd = 2,lty = 2,col = "red") ## Allocation smoothScatter(input$true_A,ith_out$infer$infer_A, main = "Allocation",xlab = "Truth",ylab = "Inferred", cex.lab = 1.5) abline(a = 0,b = 1,lwd = 2,lty = 2,col = "red") if( any(is.na(ith_out$infer$infer_A)) ) cat("Some loci couldn't classify\n") ## Multiplicity smart_table(Truth = input$true_M, Inferred = ith_out$infer$infer_M) ## ----unknown_opt,R.options = list(width = 200),fig.dim = c(8,6)--------------------------------------------------------------------------------------------------------------------------------------- grid_ith = grid_ITH_optim( my_data = input, my_purity = sim$purity, list_eS = eS, trials = 50, max_iter = 4e3) names(grid_ith) # Grid of solutions grid_ith$GRID # Order solution(s) based on BIC (larger BIC correspond to better fits) gg = grid_ith$GRID gg = gg[order(-gg$BIC),] head(gg) # true cancer proportions sim$q # true entropy -sum(sim$q * log(sim$q)) ## ----post,fig.dim = c(8,5)---------------------------------------------------- pp = vis_GRID(GRID = grid_ith$GRID) print(pp) ## ----------------------------------------------------------------------------- gg = grid_ith$GRID idx = which(gg$BIC == max(gg$BIC)) gg[idx,] gg$cc[idx][1] ## ----------------------------------------------------------------------------- opt_entropy = gg$entropy[idx] names(opt_entropy) = sprintf("Solution %s",idx) opt_entropy ## ----------------------------------------------------------------------------- props = list() for(jj in seq(length(idx))){ opt_prop = gg$q[idx[jj]] opt_prop = as.numeric(strsplit(opt_prop,",")[[1]]) names(opt_prop) = sprintf("SC%s",seq(length(opt_prop))) props[[sprintf("Solution %s",idx[jj])]] = opt_prop } props ## ----------------------------------------------------------------------------- for(jj in seq(length(idx))){ cat(sprintf("Solution %s\n",idx[jj])) print(head(grid_ith$INFER[[idx[jj]]])) } ## ----------------------------------------------------------------------------- opt_cps = list() for(jj in seq(length(idx))){ # jj = 1 tab = table(smart_digits(grid_ith$INFER[[idx[jj]]]$infer_CP,4), grid_ith$INFER[[idx[jj]]]$infer_A) rownames(tab) = sprintf("CP = %s",rownames(tab)) colnames(tab) = sprintf("ALLOC = %s",colnames(tab)) opt_cps[[sprintf("Solution %s",idx[jj])]] = tab } opt_cps ## ----------------------------------------------------------------------------- opt = list() for(jj in seq(length(idx))){ # jj = 1 opt_cc = gg$cc[idx[jj]] opt_kk = gg$kk[idx[jj]] mat = eS[[opt_cc]][[opt_kk]] dimnames(mat) = list(sprintf("ALLOC = %s",seq(opt_cc)),sprintf("SC%s",seq(opt_cc))) opt[[sprintf("Solution %s",idx[jj])]] = mat } opt ## ----sessInfo----------------------------------------------------------------- sessionInfo()