### R code from vignette source 'Canopy_vignettes.Rnw'
### Encoding: UTF-8

###################################################
### code chunk number 1: Installation1 (eval = FALSE)
###################################################
## install.packages('Canopy')  # updated every 3-4 months


###################################################
### code chunk number 2: Installation2 (eval = FALSE)
###################################################
## install.packages("devtools")
## library(devtools)
## install_github("yuchaojiang/Canopy/package")  # STRONGLY recommended


###################################################
### code chunk number 3: Input
###################################################
library(Canopy)
data("MDA231")

projectname = MDA231$projectname ## name of project
R = MDA231$R; R ## mutant allele read depth (for SNAs)
X = MDA231$X; X ## total depth (for SNAs)
WM = MDA231$WM; WM ## observed major copy number (for CNA regions)
Wm = MDA231$Wm; Wm ## observed minor copy number (for CNA regions)
epsilonM = MDA231$epsilonM ## standard deviation of WM, pre-fixed here
epsilonm = MDA231$epsilonm ## standard deviation of Wm, pre-fixed here
## Matrix C specifices whether CNA regions harbor specific CNAs 
## only needed if overlapping CNAs are observed, specifying which CNAs overlap
C = MDA231$C; C
Y = MDA231$Y; Y ## whether SNAs are affected by CNAs


###################################################
### code chunk number 4: SNA_clustering
###################################################
library(Canopy)
data(toy3)
R=toy3$R; X=toy3$X # 200 mutations across 3 samples
num_cluster=2:9 # Range of number of clusters to run
num_run=10 # How many EM runs per clustering step for each mutation cluster wave
canopy.cluster=canopy.cluster(R = R,
                              X = X,
                              num_cluster = num_cluster,
                              num_run = num_run)
bic_output=canopy.cluster$bic_output # BIC for model selection (# of clusters)
Mu=canopy.cluster$Mu # VAF centroid for each cluster
Tau=canopy.cluster$Tau  # Prior for mutation cluster, with a K+1 component
sna_cluster=canopy.cluster$sna_cluster # cluster identity for each mutation


###################################################
### code chunk number 5: fig1
###################################################
par(mfrow=c(1,2))
plot(num_cluster,bic_output,xlab='Number of mutation clusters',ylab='BIC',type='b',main='BIC for model selection')
abline(v=num_cluster[which.max(bic_output)],lty=2)
colc=c('green4','red3','royalblue1','darkorange1','royalblue4',
       'mediumvioletred','seagreen4','olivedrab4','steelblue4','lavenderblush4')
pchc=c(17,0,1,15,3,16,4,8,2,16)
library(scatterplot3d)
scatterplot3d((R/X)[,1],(R/X)[,2],(R/X)[,3],xlim=c(0,max(R/X)),ylim=c(0,max(R/X)),zlim=c(0,max(R/X)),color=colc[sna_cluster],pch=pchc[sna_cluster],
              xlab='Sample1 VAF',ylab='Sample2 VAF',zlab='Sample3 VAF',
              main='VAF clustering across 3 samples')
par(mfrow=c(1,1))


###################################################
### code chunk number 6: SNA_clustering_AML43
###################################################
library(Canopy)
data(AML43)
R=AML43$R; X=AML43$X
num_cluster=4 # Range of number of clusters to run
num_run=6 # How many EM runs per clustering step for each mutation cluster wave
Tau_Kplus1=0.05 # Pre-specified proportion of noise component
Mu.init=cbind(c(0.01,0.15,0.25,0.45),
              c(0.2,0.2,0.01,0.2)) # Initial value for centroid
canopy.cluster=canopy.cluster(R = R,
                              X = X,
                              num_cluster = num_cluster,
                              num_run = num_run,
                              Mu.init = Mu.init,
                              Tau_Kplus1=Tau_Kplus1)
Mu=canopy.cluster$Mu # VAF centroid for each cluster
Tau=canopy.cluster$Tau  # Prior for mutation cluster, with a K+1 component
sna_cluster=canopy.cluster$sna_cluster # cluster identity for each mutation

R.qc=R[sna_cluster<=4,] # exclude mutations in the noise cluster
X.qc=X[sna_cluster<=4,]
sna_cluster.qc=sna_cluster[sna_cluster<=4]

R.cluster=round(Mu*100)  # Generate pseudo-SNAs correponding to each cluster. 
X.cluster=pmax(R.cluster,100)   # Total depth is set at 100 but can be obtained as median instead 
rownames(R.cluster)=rownames(X.cluster)=paste('SNA.cluster',1:4,sep='')


###################################################
### code chunk number 7: fig2
###################################################
Mu=canopy.cluster$Mu # VAF centroid for each cluster
Tau=canopy.cluster$Tau  # Prior for mutation cluster, with a K+1 component
sna_cluster=canopy.cluster$sna_cluster # cluster identity for each mutation
colc=c('green4','red3','royalblue1','darkorange1','royalblue4',
       'mediumvioletred','seagreen4','olivedrab4','steelblue4','lavenderblush4')
pchc=c(17,0,1,15,3,16,4,8,2,16)
plot((R/X)[,1],(R/X)[,2],xlab='Sample1 VAF',ylab='Sample2 VAF',col=colc[sna_cluster],pch=pchc[sna_cluster],ylim=c(0,max(R/X)),xlim=c(0,max(R/X)))


###################################################
### code chunk number 8: Tree_elements1
###################################################
data('MDA231_tree')
MDA231_tree$Z # Z matrix specifies the position of the SNAs along the tree branch
MDA231_tree$cna.copy # major and minor copy number (interger values) for each CNA
MDA231_tree$CM # Major copy per clone for each CNA
MDA231_tree$Cm # Minor copy per clone for each CNA
MDA231_tree$Q # whether an SNA precedes a CNA


###################################################
### code chunk number 9: Tree_elements2
###################################################
MDA231_tree$H # if an SNA precedes a CNA, whether it resides in the major copy
MDA231_tree$P # clonal compostion for each sample
MDA231_tree$VAF # VAF based on current tree structure


###################################################
### code chunk number 10: Sampling1 (eval = FALSE)
###################################################
## K = 3:6 # number of subclones
## numchain = 20 # number of chains with random initiations
## sampchain = canopy.sample(R = R, X = X, WM = WM, Wm = Wm, epsilonM = epsilonM, 
##             epsilonm = epsilonm, C = C, Y = Y, K = K, numchain = numchain, 
##             max.simrun = 50000, min.simrun = 10000, 
##             writeskip = 200, projectname = projectname, cell.line = TRUE, 
##             plot.likelihood = TRUE)
## save.image(file = paste(projectname, '_postmcmc_image.rda',sep=''),
##            compress = 'xz')


###################################################
### code chunk number 11: Sampling2
###################################################
data("MDA231_sampchain")
sampchain = MDA231_sampchain
k = 3; K = 3:6
sampchaink = MDA231_sampchain[[which(K==k)]]


###################################################
### code chunk number 12: Sampling3
###################################################
length(sampchain) ## number of subtree spaces (K=3:6)
length(sampchain[[which(K==4)]]) ## number of chains for subtree space with 4 subclones
length(sampchain[[which(K==4)]][[1]]) ## number of posterior trees in each chain


###################################################
### code chunk number 13: BIC
###################################################
burnin = 100
thin = 5 # If there is error in the bic and canopy.post step below, make sure
         # burnin and thinning parameters are wisely selected so that there are
         # posterior trees left.
bic = canopy.BIC(sampchain = sampchain, projectname = projectname, K = K,
               numchain = numchain, burnin = burnin, thin = thin, pdf = FALSE)
optK = K[which.max(bic)]


###################################################
### code chunk number 14: fig3
###################################################
# Note: this segment is soley for generating BIC plot in the vignettes.
# Use Canopy.BIC() with pdf = TRUE to generate this plot directly.
par(mfrow=c(1,2))
projectname = 'MDA231'
numchain = 20
clikelihood = matrix(nrow = numchain, ncol = length(sampchaink[[1]]), data = NA)
for(numi in 1:numchain){
  for(i in 1:ncol(clikelihood)){
    clikelihood[numi,i] = sampchaink[[numi]][[i]]$likelihood
  }
}
plot(1:ncol(clikelihood), clikelihood[1,], type='l', xlab = 'Iteration',
     ylab = 'Log-likelihood', col = 1, ylim = c(min(clikelihood), 
                                                max(clikelihood)))
for(numi in 2:numchain){
  points(1:ncol(clikelihood), clikelihood[numi,], type = 'l', col = numi)
}
title(main=paste('Posterior likelihood', k, 'clones', numchain,
            'chains'),cex=0.6)
plot(K, bic, xlab = 'Number of subclones', ylab = 'BIC', type = 'b', xaxt = "n")
axis(1, at = K)
abline(v = (3:6)[which.max(bic)], lty = 2)
title('BIC for model selection')


###################################################
### code chunk number 15: Post
###################################################
post = canopy.post(sampchain = sampchain, projectname = projectname, K = K,
                 numchain = numchain, burnin = burnin, thin = thin, optK = optK,
                 C = C, post.config.cutoff = 0.05)
samptreethin = post[[1]]   # list of all post-burnin and thinning trees
samptreethin.lik = post[[2]]   # likelihoods of trees in samptree
config = post[[3]] # configuration for each posterior tree
config.summary = post[[4]] # configuration summary
print(config.summary)
# first column: tree configuration
# second column: posterior configuration probability in the entire tree space
# third column: posterior configuration likelihood in the subtree space


###################################################
### code chunk number 16: Plot
###################################################
config.i = config.summary[which.max(config.summary[,3]),1]
cat('Configuration', config.i, 'has the highest posterior likelihood!\n')
# plot the most likely tree in the posterior tree space
output.tree = canopy.output(post, config.i, C)
canopy.plottree(output.tree)

# plot the tree with configuration 1 in the posterior tree space
output.tree = canopy.output(post, 1, C)
canopy.plottree(output.tree, pdf=TRUE, pdf.name = 
                    paste(projectname,'_first_config.pdf',sep=''))


###################################################
### code chunk number 17: fig4
###################################################
canopy.plottree(output.tree)


###################################################
### code chunk number 18: Try it your self (eval = FALSE)
###################################################
## library(Canopy)
## data(toy)
## projectname = 'toy'
## R = toy$R; X = toy$X; WM = toy$WM; Wm = toy$Wm
## epsilonM = toy$epsilonM; epsilonm = toy$epsilonm; Y = toy$Y
## 
## K = 3:6; numchain = 10
## sampchain = canopy.sample(R = R, X = X, WM = WM, Wm = Wm, epsilonM = epsilonM, 
##                           epsilonm = epsilonm, C = NULL, Y = Y, K = K, 
##                           numchain = numchain, simrun = 50000, writeskip = 200,
##                           projectname = projectname, cell.line = FALSE,
##                           plot.likelihood = TRUE)


###################################################
### code chunk number 19: fig5
###################################################
data(toy)
canopy.plottree(toy$besttree, txt = FALSE, pdf = FALSE)


###################################################
### code chunk number 20: Try it your self2 (eval = FALSE)
###################################################
## library(Canopy)
## data(toy2)
## projectname = 'toy2'
## R = toy2$R; X = toy2$X; WM = toy2$WM; Wm = toy2$Wm
## epsilonM = toy2$epsilonM; epsilonm = toy2$epsilonm; Y = toy2$Y
## true.tree = toy2$true.tree  # true underlying tree
## K = 3:6; numchain = 15
## sampchain = canopy.sample(R = R, X = X, WM = WM, Wm = Wm, epsilonM = epsilonM, 
##                           epsilonm = epsilonm, C = NULL, Y = Y, K = K, 
##                           numchain = numchain, max.simrun = 100000,
##                           min.simrun = 10000, writeskip = 200,
##                           projectname = projectname, cell.line = FALSE,
##                           plot.likelihood = TRUE)


###################################################
### code chunk number 21: fig6
###################################################
data(toy2)
canopy.plottree(toy2$true.tree, txt = FALSE, pdf = FALSE)


###################################################
### code chunk number 22: Try it your self3 (eval = FALSE)
###################################################
## library(Canopy)
## data(toy3)
## R=toy3$R; X=toy3$X
## num_cluster=2:9 # Range of number of clusters to run
## num_run=10 # How many EM runs per clustering step for each mutation cluster wave
## canopy.cluster=canopy.cluster(R = R,
##                               X = X,
##                               num_cluster = num_cluster,
##                               num_run = num_run)
## 
## bic_output=canopy.cluster$bic_output # BIC for model selection (# of clusters)
## Mu=canopy.cluster$Mu # VAF centroid for each cluster
## Tau=canopy.cluster$Tau  # Prior for mutation cluster, with a K+1 component
## sna_cluster=canopy.cluster$sna_cluster # cluster identity for each mutation
## 
## projectname='toy3'
## K = 3:5 # number of subclones
## numchain = 15 # number of chains with random initiations
## sampchain = canopy.sample.cluster.nocna(R = R, X = X, sna_cluster = sna_cluster,
##                                         K = K, numchain = numchain, 
##                                         max.simrun = 100000, min.simrun = 20000,
##                                         writeskip = 200, projectname = projectname,
##                                         cell.line = FALSE, plot.likelihood = TRUE)
## save.image(file = paste(projectname, '_postmcmc_image.rda',sep=''),
##            compress = 'xz')


###################################################
### code chunk number 23: sessionInfo
###################################################
toLatex(sessionInfo())