## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## -----------------------------------------------------------------------------
suppressMessages(library(mclust))
suppressMessages(library(magrittr))
suppressMessages(library(ggplot2))
library(dsb) 

## -----------------------------------------------------------------------------
cell = dsb::cells_citeseq_mtx
neg = dsb::empty_drop_citeseq_mtx

## -----------------------------------------------------------------------------
# log transformation
dlog = log(cell + 10)
nlog = log(neg + 10)

## -----------------------------------------------------------------------------
# calc mean and sd of background drops 
sd_nlog = apply(nlog, 1 , sd)
mean_nlog = apply(nlog, 1 , mean)


## -----------------------------------------------------------------------------

norm_adt = apply(dlog, 2, function(x) (x  - mean_nlog) / sd_nlog) 


## ----fig.width=8, fig.height=3.5----------------------------------------------
# check structure of denoised data with zero centering of background population 
r = 'deepskyblue3'
plist = list(theme_bw(), geom_density_2d(color = r), 
             geom_vline(xintercept = 0, linetype = 'dashed'),
             geom_hline(yintercept = 0, linetype = 'dashed'),
             xlab('CD3'), ylab('CD19')
             )

p1 = qplot(as.data.frame(t(norm_adt))$CD3_PROT, 
      as.data.frame(t(norm_adt))$CD19_PROT) + 
  plist +
  ggtitle('dsb step 1')
  
# raw data 
p2 = qplot(as.data.frame(t(cell))$CD3_PROT, 
      as.data.frame(t(cell))$CD19_PROT) + 
  plist + 
  ggtitle('RAW data') 
  

# log transformed data 
p3 = qplot(as.data.frame(t(dlog))$CD3_PROT, 
          as.data.frame(t(dlog))$CD19_PROT) + 
  plist + 
  ggtitle('log transformed') 
  
# examine distributions. 
cowplot::plot_grid(p1, p2, p3, nrow = 1) 

## -----------------------------------------------------------------------------
# fit 2 component Gaussian mixture to each cell 
cmd = apply(norm_adt, 2, function(x) {
  g = Mclust(x, G=2,  warn = TRUE , verbose = FALSE)  
  return(g) 
})


## ----fig.width=4.5, fig.height=3.5--------------------------------------------
cell.name = colnames(norm_adt)[2]
# get density 
cell2 = MclustDR(cmd[[2]])
plot(cell2, what = "density", main = 'test')
title(cex.main = 0.6, paste0('single cell:  ',cell.name,
                             '\n k = 2 component Gaussian Mixture '))


## -----------------------------------------------------------------------------
table(cell2$classification)

## ----fig.width=4.5, fig.height=3.5--------------------------------------------
# fit a mixture with between 1 and 6 components. 
m.list = lapply(as.list(1:6), function(k) Mclust(norm_adt[ ,2], G = k,
                                                 warn = FALSE, verbose = FALSE ))
# extract densities for k = 3 and k = 6 component models 
dr_3 = MclustDR(m.list[[3]])
dr_6 = MclustDR(m.list[[6]])

# visualize distributiion of protein populations with different k 
# in Gaussian mixture for a single cell 
plot.title = paste0('single cell:  ', cell.name, 
                    '\n k-component Gaussian Mixture ')

plot(dr_3,what = "density")
title(cex.main = 0.6, plot.title)


plot(dr_6,what = "density")
title(cex.main = 0.6, plot.title)


## ----fig.width=4.5, fig.height=3.5--------------------------------------------

plot(
  sapply(m.list, function(x) x$bic), type = 'b',
  ylab = 'Mclust BIC -- higher is better', 
  xlab = 'mixing components k in Gaussian Mixture model',
  main = paste0('single cell:  ', cell.name),
  col =r, pch = 18, cex = 1.4, cex.main = 0.8
  )

## ----fig.width=4.5, fig.height=3.5--------------------------------------------

# extract mu 1 as a vector 
mu.1 = unlist(
  lapply(
    cmd, function(x){x$parameters$mean[1] }
    ) 
)

# check distribution of the fitted value for µ1 across cells 
hist(mu.1, col = r,breaks = 30) 


## -----------------------------------------------------------------------------
# define isotype controls 
isotype.control.name.vec = c("Mouse IgG2bkIsotype_PROT", "MouseIgG1kappaisotype_PROT", 
                             "MouseIgG2akappaisotype_PROT", "RatIgG2bkIsotype_PROT" )
# construct noise matrix 
noise_matrix = rbind(mu.1, norm_adt[isotype.control.name.vec, ])

# transpose to fit PC 
tmat = t(noise_matrix)

# view the noise matrix on which to calculate pc1 scores. 
head(tmat)

## ----fig.width=4.5, fig.height=3.5--------------------------------------------
# calculate principal component 1 
g = prcomp(tmat, scale = TRUE)

# get the dsb technical component for each cell -- PC1 scores (position along PC1) for each cell 
head(g$x)  
head(g$rotation)
dsb.technical.component = g$x[ ,1]

hist(dsb.technical.component, breaks = 40, col = r)

## ----fig.width=4.5, fig.height=3.5--------------------------------------------

# constructs a matrix of 1 by number of columns in norm_adt1
covariate = as.matrix(dsb.technical.component)
design = matrix(1, ncol(norm_adt), 1)

# fit a linear model to solve the coefficient for each protein with QR decomposition. 
fit <- limma::lmFit(norm_adt, cbind(design, covariate))

# this subset just extracts the beta for each protein. 
beta <- fit$coefficients[, -(1:ncol(design)), drop = FALSE]
beta[is.na(beta)] <- 0

# beta is the coefficient for each prot. 
plot(beta, col = r , pch = 16, xlab = 'protein')


## -----------------------------------------------------------------------------
denoised_adt_2 = as.matrix(norm_adt) - beta %*% t(covariate)

## -----------------------------------------------------------------------------
# regress out the technical component using limma 
# note this is how limma (and dsb) calculates this 
denoised_adt = limma::removeBatchEffect(norm_adt, covariates = dsb.technical.component)


## -----------------------------------------------------------------------------
# default dsb call 
denoised_adt_3 = DSBNormalizeProtein(cell_protein_matrix = cell, 
                                     empty_drop_matrix = neg, 
                                     denoise.counts = TRUE, 
                                     isotype.control.name.vec = isotype.control.name.vec)


## ----fig.width=4.5, fig.height=3.5--------------------------------------------

qplot(as.data.frame(t(denoised_adt))$CD3_PROT, 
      as.data.frame(t(denoised_adt))$CD19_PROT) + 
  plist +
  ggtitle('dsb step 1 + 2 normalized and denoised') + 
  geom_vline(xintercept = 3.5, color = 'red') +
  geom_hline(yintercept = 3.5, color = 'red')