## ----options,include = FALSE--------------------------------------------------
knitr::opts_chunk$set(
	collapse = TRUE,comment = "#>",
	echo = TRUE,cache = TRUE,
	dev = "png")

# Fix seed
set.seed(12)

## ----setup,warning = FALSE----------------------------------------------------
req_packs = c("devtools","ggplot2","smarter","SMASH")
for(pack in req_packs){
  library(package = pack,character.only = TRUE)
}

# List package's exported functions
ls("package:SMASH")

## -----------------------------------------------------------------------------
eS[[3]][[1]]

## -----------------------------------------------------------------------------
eS[[3]][[2]]

## ----echo = FALSE-------------------------------------------------------------
maxLOCI = 50
meanDP = 1e3
nCN    = 3

## ----sim----------------------------------------------------------------------
sim = gen_subj_truth(
  mat_eS = eS[[3]][[2]],
  maxLOCI = maxLOCI,
  nCN = nCN)
class(sim)
names(sim)

## -----------------------------------------------------------------------------
# Underlying true allocation, multiplicity, and cellular prevalence per point mutation
dim(sim$subj_truth)
sim$subj_truth[1:5,]

# Combinations of allelic copy number, allocation, and multiplicity 
#   with resulting mean VAFs (written as MAF) and cellular prevalences
uniq_states = unique(sim$subj_truth[,
  c("CN_1","CN_2","true_A","true_M","true_MAF","true_CP")])
rownames(uniq_states) = NULL
round(uniq_states,3)

# Tumor purity
sim$purity

# Tumor subclone proportions
sim$eta

# Cancer subclone proportions
sim$eta / sim$purity
sim$q

## -----------------------------------------------------------------------------
mat_RD = gen_ITH_RD(DATA = sim$subj_truth,RD = meanDP)
dim(mat_RD); mat_RD[1:5,]
smart_hist(rowSums(mat_RD),breaks = 20,
  xlab = "Total Depth",cex.lab = 1.5)

# Combine copy number and read count information
input = cbind(sim$subj_truth,mat_RD)

# Calculate observed VAF
input$VAF = input$tAD / rowSums(input[,c("tAD","tRD")])

# Remove rows with no alternate depth
input = input[which(input$tAD > 0),]
dim(input)
input[1:3,]

## -----------------------------------------------------------------------------
# All loci
smart_hist(input$VAF,breaks = 30,main = "All Loci",
  xlab = "Observed VAF",cex.lab = 1.5)
abline(v = unique(input$true_MAF),lty = 2,lwd = 2,col = "magenta")

# Loci per copy number state
uCN = unique(input[,c("CN_1","CN_2")])
tmp_range = range(input$VAF)

for(ii in seq(nrow(uCN))){
  # ii = 3
  idx = which(input$CN_1 == uCN$CN_1[ii] & input$CN_2 == uCN$CN_2[ii])
  smart_hist(input$VAF[idx],breaks = 20,xlim = tmp_range,
    main = sprintf("Loci with (CN_1,CN_2) = (%s,%s)",uCN$CN_1[ii],uCN$CN_2[ii]),
    xlab = "Observed VAF",cex.lab = 1.5)
  abline(v = unique(input$true_MAF[idx]),lty = 2,lwd = 2,col = "magenta")
}

## ----know_config,fig.dim = c(5,5)---------------------------------------------
# Calculate true_unc_q, the unconstrained subclone proportions in cancer
true_unc_q = log(sim$q[-length(sim$q)] / sim$q[length(sim$q)])
true_unc_q

# Optimize
ith_out = ITH_optim(my_data = input,
  my_purity = sim$purity,
  pi_eps0 = 0,
  my_unc_q = true_unc_q,
  init_eS = eS[[3]][[2]])

# Estimate of unclassified loci
ith_out$pi_eps

# Model fit BIC
ith_out$BIC

# Estimate of subclone proportions in cancer
nSC = length(ith_out$q)
tmp_df = smart_df(SC = as.character(rep(seq(nSC),2)),
  CLASS = c(rep("Truth",nSC),rep("Estimate",nSC)),
  VALUE = c(sim$q,ith_out$q))
tmp_df

ggplot(data = tmp_df,mapping = aes(x = SC,y = VALUE,fill = CLASS)) +
  geom_bar(width = 0.5,stat = "identity",position = position_dodge()) +
  xlab("Subclone") + ylab("Proportion in Cancer") +
  theme(legend.position = "bottom",
    text = element_text(size = 20))

# Compare truth vs estimated/inferred

## Variant Allele Frequency
smoothScatter(input$true_MAF,ith_out$infer$infer_MAF,
  main = "VAF",xlab = "Truth",ylab = "Inferred",cex.lab = 1.5)
abline(a = 0,b = 1,lwd = 2,lty = 2,col = "red")

smoothScatter(input$VAF,ith_out$infer$infer_MAF,
  main = "VAF",xlab = "Observed",ylab = "Inferred",cex.lab = 1.5)
abline(a = 0,b = 1,lwd = 2,lty = 2,col = "red")

## Cellular prevalence
smoothScatter(input$true_CP,
  ith_out$infer$infer_CP,main = "Cellular Prevalence",
  xlim = c(0,1),ylim = c(0,1),
  xlab = "Truth",ylab = "Estimate",cex.lab = 1.5)
abline(a = 0,b = 1,lwd = 2,lty = 2,col = "red")

## Allocation
smoothScatter(input$true_A,ith_out$infer$infer_A,
  main = "Allocation",xlab = "Truth",ylab = "Inferred",
  cex.lab = 1.5)
abline(a = 0,b = 1,lwd = 2,lty = 2,col = "red")

if( any(is.na(ith_out$infer$infer_A)) )
  cat("Some loci couldn't classify\n")

## Multiplicity
smart_table(Truth = input$true_M,
  Inferred = ith_out$infer$infer_M)

## ----unknown_opt,R.options = list(width = 200),fig.dim = c(8,6)---------------------------------------------------------------------------------------------------------------------------------------
grid_ith = grid_ITH_optim(
  my_data = input,
  my_purity = sim$purity,
  list_eS = eS,
  trials = 50,
  max_iter = 4e3)
names(grid_ith)

# Grid of solutions
grid_ith$GRID

# Order solution(s) based on BIC (larger BIC correspond to better fits)
gg = grid_ith$GRID
gg = gg[order(-gg$BIC),]
head(gg)

# true cancer proportions
sim$q

# true entropy
-sum(sim$q * log(sim$q))

## ----post,fig.dim = c(8,5)----------------------------------------------------
pp = vis_GRID(GRID = grid_ith$GRID)
print(pp)

## -----------------------------------------------------------------------------
gg = grid_ith$GRID
idx = which(gg$BIC == max(gg$BIC))
gg[idx,]

gg$cc[idx][1]

## -----------------------------------------------------------------------------
opt_entropy = gg$entropy[idx]
names(opt_entropy) = sprintf("Solution %s",idx)
opt_entropy

## -----------------------------------------------------------------------------
props = list()
for(jj in seq(length(idx))){
  opt_prop = gg$q[idx[jj]]
  opt_prop = as.numeric(strsplit(opt_prop,",")[[1]])
  names(opt_prop) = sprintf("SC%s",seq(length(opt_prop)))
  props[[sprintf("Solution %s",idx[jj])]] = opt_prop
}
props

## -----------------------------------------------------------------------------
for(jj in seq(length(idx))){
  cat(sprintf("Solution %s\n",idx[jj]))
  print(head(grid_ith$INFER[[idx[jj]]]))
}

## -----------------------------------------------------------------------------
opt_cps = list()
for(jj in seq(length(idx))){
  # jj = 1
  tab = table(smart_digits(grid_ith$INFER[[idx[jj]]]$infer_CP,4),
    grid_ith$INFER[[idx[jj]]]$infer_A)
  rownames(tab) = sprintf("CP = %s",rownames(tab))
  colnames(tab) = sprintf("ALLOC = %s",colnames(tab))
  opt_cps[[sprintf("Solution %s",idx[jj])]] = tab
}
opt_cps

## -----------------------------------------------------------------------------
opt = list()
for(jj in seq(length(idx))){
  # jj = 1
  opt_cc = gg$cc[idx[jj]]
  opt_kk = gg$kk[idx[jj]]
  mat = eS[[opt_cc]][[opt_kk]]
  dimnames(mat) = list(sprintf("ALLOC = %s",seq(opt_cc)),sprintf("SC%s",seq(opt_cc)))
  opt[[sprintf("Solution %s",idx[jj])]] = mat
}
opt

## ----sessInfo-----------------------------------------------------------------
sessionInfo()