---
title: 'SpaCOAP: mouse spleen dataset'
author: "Wei Liu"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{SpaCOAP: mouse spleen dataset}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---
  
```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```
This vignette introduces the SpaCOAP workflow for the analysis of  spatial multi-omics dataset.


We demonstrate the use of SpaCOAP to one Spleen SPOTS data that are [here](https://github.com/feiyoung/SpaCOAP/tree/master/vignettes_data/), which can be downloaded to the current working path by the following command:
```{r eval=FALSE}
url1 <- "https://github.com/feiyoung/SpaCOAP/tree/master/vignettes_data/"
rna_object <- "seu_rna_over_Spleen.RDS?raw=true"
download.file(paste0(url1, rna_object),"seu_rna_over_Spleen.RDS",mode='wb')
protein_object <- "seu_adt_over_Spleen.RDS?raw=true"
download.file(paste0(url1,protein_object), 'seu_adt_over_Spleen.RDS', mode='wb')
## download  annotation
download.file(paste0(url1, "cell_clusters_anno.rds?raw=true"), "cell_clusters_anno.rds", "wb")
```

### Data preparation
Then load to R. Here, we only focus one section.
```{r  eval =FALSE}
seu_rna_over <- readRDS("./seu_rna_over_Spleen.RDS")
seu_adt_over <- readRDS("./seu_adt_over_Spleen.RDS")
load("./cell_clusters_anno.rds")
```

The package can be loaded with the command:
```{r  eval =FALSE}
library(SpaCOAP) #
```

Define some functions
```{r  eval =FALSE}
searchRadius <- function(pos, lower.med=8, upper.med=10, radius.upper= NULL){
  if (!inherits(pos, "matrix"))
    stop("method is only for  matrix object!")
  
  
  ## Automatically determine the upper radius
  n_spots <- nrow(pos)
  idx <- sample(n_spots, min(100, n_spots))
  dis <- dist(pos[idx,])
  if(is.null(radius.upper)){
    #radius.upper <- max(dis)
    radius.upper <- sort(dis)[20] ## select the nearest 20 spots.
  }
  radius.lower <- min(dis[dis>0])
  Adj_sp <- DR.SC:::getneighborhood_fast(pos, radius=radius.upper)
  Med <- summary(Matrix::rowSums(Adj_sp))['Median']
  if(Med < lower.med) stop("The radius.upper is too smaller that cannot find median neighbors greater than 4.")
  start.radius <- 1
  Med <- 0
  message("Find the adjacency matrix by bisection method...")
  maxIter <- 30
  k <- 1
  while(!(Med >= lower.med && Med <=upper.med)){ # ensure that each spot has about 4~6 neighborhoods in median.
    
    Adj_sp <- DR.SC:::getneighborhood_fast(pos, radius=start.radius)
    Med <- summary(Matrix::rowSums(Adj_sp))['Median']
    if(Med < lower.med){
      radius.lower <- start.radius
      start.radius <- (radius.lower + radius.upper)/2
    }else if(Med >upper.med){
      radius.upper <- start.radius
      start.radius <- (radius.lower + radius.upper)/2
    }
    message("Current radius is ", round(start.radius, 2))
    message("Median of neighborhoods is ", Med)
    if(k > maxIter) {
      message("Reach the maximum iteration but can not find a proper radius!")
      break;
    }
    k <- k + 1
  }
  
  return(start.radius)
}

acc_fun <- function(y1, y2){
  n1 <- length(unique(y1))
  n2 <- length(unique(y2))
  if(n1<n2){ ## ensure n1> n2
    a <- y1
    y1 <- y2
    y2 <- a
    n1 <- length(unique(y1))
    n2 <- length(unique(y2))
  }
  cm <- as.matrix(table(Actual = y1, Predicted = y2))
  rnames <-row.names(cm)
  cnames <- colnames(cm)
  union_names <- union(rnames, cnames)
  n <- length(union_names)
  cm_new <- matrix(0, n, n)
  row.names(cm_new) <- colnames(cm_new) <- union_names
  for(r in 1:n2){
    cm_new[rnames,cnames[r]] <- cm[rnames,cnames[r]]
  }
  
  sum(diag(cm_new)) / length(y1)
}

kappa_fun <- function(y1, y2){
  require(irr)
  dat <- data.frame(y1, y2)
  k_res <- kappa2(dat)
  k_res$value
}
```

## View the DLPFC data
Wrap the data matrix from the SeuratObject, including the RNA expression count matrix `X_count`, covraite matrix `H` associated with protein markters, control variables (here only an intercept term) `Z`,  and spatial coordinate  matrix `pos`.
```{r  eval =FALSE}

X_count <- Matrix::t(seu_rna_over[["RNA"]][seu_rna_over[['RNA']]@var.features,])
X_count <-  as.matrix(X_count)
H <- t(as.matrix(seu_adt_over[["ADT"]]@data))
Z <- matrix(1, nrow(H), 1)
pos <- cbind(seu_rna_over$X0, seu_rna_over$X1)


radius_use <- searchRadius(pos, radius.upper = NULL)
set.seed(1)
n_spots <- nrow(pos)
idx <- sample(n_spots, min(100, n_spots))
dis <- dist(pos[idx,])
Adj_sp <-  SpaCOAP:::getneighbor_weightmat(pos, radius = radius_use, width=median(dis))
```

### Determine the structure dimension

Next, we select the number of factors and the rank of regreesion coefficient matrix using the proposed criterion.
```{r  eval =FALSE}
q_max <- 20
d <- ncol(H)
rank_max <- d
tic <- proc.time()
reslist_max <- SpaCOAP(X_count, Adj_sp,  H,  Z,
                   rank_use = rank_max, q=q_max, epsELBO = 1e-8,  maxIter = 30)
toc <- proc.time()
time_spacoap_max <- toc[3] - tic[3]
```

We apply the criterion, taking into account that the real data does not necessarily originate from our model. Therefore, we opt for a conservative approach that retains more information.
```{r  eval =FALSE}
threshold=c(1e-15, 1e-20)
thre1 <- threshold[1]
beta_svalues <- svd(reslist_max$bbeta)$d
beta_svalues <- beta_svalues[beta_svalues>thre1]
ratio1 <- beta_svalues[-length(beta_svalues)] / beta_svalues[-1]
ratio1[1:10]
## Here, we set hr = 9 rather 1 to retain more information

thre2 <- threshold[2]
B_svalues <- svd(reslist_max$B)$d
B_svalues <- B_svalues[B_svalues>thre2]
ratio_fac <- B_svalues[-length(B_svalues)] / B_svalues[-1]
ratio_fac[1:6]
# [1] 6.123909e+00 2.749414e+00 1.376498e+00 1.314487e+00 3.310699e+14
# Here, we choose q=5 since huge decrease of singular values happen in q=5.
hq <- 5
```

### Fitting SpaCOAP
First, we run the proposed SpaCOAP method.
```{r  eval =FALSE}
hr <- 9;hq <- 5
featureList <- list()
tic <- proc.time()
reslist <- SpaCOAP(X_count, Adj_sp, H=H, Z= Z,  
                   rank_use = hr, q=hq, epsELBO = 1e-8)
toc <- proc.time()
time_spacoap <- toc[3] - tic[3]
Matrix::rankMatrix(reslist$bbeta)
svd_x <- svd(reslist$bbeta, nu = hr, nv=hr)
H_spacoap <- H %*% svd_x$v
(R2_spacoap <- ProFAST::get_r2_mcfadden(embeds= cbind(H_spacoap, reslist$F), y=as.factor(cell_clusters)))
featureList[['SpaCOAP']] <- cbind(H_spacoap, reslist$F)


```

  


### Compare SpaCOAP and other methods

First, we run COAP and calculate the MacFadden's R-square.
```{r  eval =FALSE}
##COAP
library(COAP)
tic <- proc.time()
res_coap <- RR_COAP(X_count, Z = cbind(Z, H), rank_use= 2+hr, q=hq, 
                    epsELBO = 1e-7, maxIter = 30)
toc <- proc.time()
time_coap <- toc[3] - tic[3]
save(res_coap, time_coap, file='reslist_time_coap.rds')
svd_x <- svd(res_coap$bbeta[,-c(1)], nu = hr, nv=hr)
H_coap <- H %*% svd_x$v
(R2_coap <- ProFAST::get_r2_mcfadden(embeds= cbind(res_coap$H, H_coap), y=as.factor(cell_clusters)))
featureList[['COAP']] <- cbind(res_coap$H, H_coap)
```

Second, we run MRRR and calculate the MacFadden's R-square.
```{r  eval = FALSE}
## MRRR
mrrr_run <- function(Y, X, rank0, q=NULL, family=list(poisson()),
                     familygroup=rep(1,ncol(Y)), epsilon = 1e-4, sv.tol = 1e-2,
                     maxIter = 2000, trace=TRUE, truncflag=FALSE, trunc=500){
  # epsilon = 1e-4; sv.tol = 1e-2; maxIter = 30; trace=TRUE
  # Y <- X_count; X <- cbind(Z, H); rank0 = r + ncol(Z)
  
  require(rrpack)
  
  n <- nrow(Y); p <- ncol(Y)
  
  if(!is.null(q)){
    rank0 <- rank0+q
    X <- cbind(X, diag(n))
  }
  if(truncflag){
    ## Trunction
    Y[Y>trunc] <- trunc
    
  }
  
  svdX0d1 <- svd(X)$d[1]
  init1 = list(kappaC0 = svdX0d1 * 5)
  offset = NULL
  control = list(epsilon = epsilon, sv.tol = sv.tol, maxit = maxIter,
                 trace = trace, gammaC0 = 1.1, plot.cv = TRUE,
                 conv.obj = TRUE)
  fit.mrrr <- mrrr(Y=Y, X=X[,-1], family = family, familygroup = familygroup,
                   penstr = list(penaltySVD = "rankCon", lambdaSVD = 1),
                   control = control, init = init1, maxrank = rank0)
  
  return(fit.mrrr)
}

tic <- proc.time()
res_mrrr <- mrrr_run(X_count, cbind(Z,H), rank0=hr+ncol(Z), q=hq)
str(res_mrrr)
toc <- proc.time()
time_mrrr <- toc[3] - tic[3]
hbbeta_mrrr <-t(res_mrrr$coef[1:ncol(cbind(Z,H)), ])
svd_x <- svd(hbbeta_mrrr[,-c(1)], nu = hr, nv=hr)
H_mrrr <- H %*% svd_x$v
Theta_hb <- (res_mrrr$coef[(ncol(cbind(Z,H))+1): (nrow(cbind(Z,H))+ncol(cbind(Z,H))), ])
svdTheta <- svd(Theta_hb, nu=hq, nv=hq)
F_mrrr <- svdTheta$u
(R2_mrrr <- ProFAST::get_r2_mcfadden(embeds= cbind(H_mrrr, F_mrrr), y=as.factor(cell_clusters)))
featureList[['MRRR']] <- cbind(H_mrrr, F_mrrr)
```

Finally, we execute FAST and compute the MacFadden's R-squared. It is worth noting that  we refrain from comparing with PLNPCA in this demo due to its time-consuming nature.
```{r  eval =FALSE}
fast_run <- function(X_count, Adj_sp, q, verbose=TRUE, epsELBO=1e-8){
  require(ProFAST)

  reslist <- FAST_run(XList = list(X_count), 
                      AdjList = list(Adj_sp), q = q, fit.model = 'poisson', 
                      verbose=verbose, epsLogLik=epsELBO)
  reslist$hV <- reslist$hV[[1]]
  return(reslist)
}
tic <- proc.time()
res_fast <- fast_run(X_count, Adj_sp, q=hq, verbose=TRUE, epsELBO=1e-8)
toc <- proc.time()
time_fast <- toc - tic
(R2_fast <- ProFAST::get_r2_mcfadden(embeds= res_fast$hV, y=as.factor(cell_clusters)))
featureList[['FAST']] <- res_fast$hV

```

### Summarize the metrics
We evaluate the correlation between the low-dimensional representations learned by SpaCOAP and other methods with the annotated cell clusters using two metrics. The first metric is the adjusted MacFadden's  $\mathrm{R}^2$,  referred to as $\mathrm{MacR}^2$. This measure quantifies the amount of biological information captured by the representations, where a higher value signifies superior performance in representation learning.

```{r  eval =FALSE}
R2List <- list()
cell_label <- cell_clusters
for(im in 1: length(featureList)){
  message("im = ", im)
  R2List[[im]] <- ProFAST::get_r2_mcfadden(embeds= featureList[[im]], y=cell_label)
}
names(R2List) <- names(featureList)
```
```{r  eval =FALSE}
R2Vec <- unlist(R2List)
names(R2Vec) <- names(R2List)
barplot(R2Vec, ylim=c(0, 0.8))
```
Finally, we employ a classification model trained through randomForest, leveraging the cell clusters and the learned low-dimensional representations by SpaCOAP and other methods. We then compare the prediction performance of this model. To achieve this, we randomly divide the samples into training and testing data with a ratio of  $7:3$. We evaluate the model's performance using prediction accuracy (ACC) and Cohen's Kappa on the testing data. In contrast of ACC, Kappa offers a more robust measure of accuracy, as it takes into account the possibility that
the agreement occurs by chance. We repeat this division process ten times to ensure the reliability of our findings.

```{r  eval =FALSE}
N <- 10
n <- length(cell_label)
methodNames <- c("SpaCOAP", "COAP",  "MRRR", "FAST")
n_methods <- length(methodNames)
metricList <- list(ACC = matrix(NA,N, n_methods), Kappa = matrix(NA,N, n_methods))
for(ii in 1: length(metricList)) colnames(metricList[[ii]]) <- methodNames
library(randomForest)
for(i in 1:N){
  # i <- 1
  message("i = ", i)
  set.seed(i)
  idx_train <- sort(sample(n, round(n*0.7)))
  idx_test <- sort(setdiff(1:n, idx_train))
  rf_spacoap <- randomForest(featureList[['SpaCOAP']][idx_train,], y=cell_label[idx_train])
  hy_spacoap <- predict(rf_spacoap, newdata=featureList[['SpaCOAP']][idx_test,])
  metricList$ACC[i,1] <- acc_fun(hy_spacoap, cell_label[idx_test])
  metricList$Kappa[i,1] <- kappa_fun(hy_spacoap, cell_label[idx_test])
  
  rf_coap <- randomForest(featureList[['COAP']][idx_train,], y=cell_label[idx_train])
  hy_coap <- predict(rf_coap, newdata=featureList[['COAP']][idx_test,])
  metricList$ACC[i,2] <- acc_fun(hy_coap, cell_label[idx_test])
  metricList$Kappa[i,2] <- kappa_fun(hy_coap, cell_label[idx_test])
 
  colnames(featureList[['MRRR']]) <- paste0("MRRR", 1:ncol(featureList[['MRRR']]))
  rf_MRRR <- randomForest(featureList[['MRRR']][idx_train,], y=cell_label[idx_train])
  hy_MRRR <- predict(rf_MRRR, newdata=featureList[['MRRR']][idx_test,])
  metricList$ACC[i,3] <- acc_fun(hy_MRRR, cell_label[idx_test])
  metricList$Kappa[i,3] <- kappa_fun(hy_MRRR, cell_label[idx_test])

  colnames(featureList[['FAST']]) <- paste0("FAST", 1:ncol(featureList[['FAST']]))
  rf_FAST <- randomForest(featureList[['FAST']][idx_train,], y=cell_label[idx_train])
  hy_FAST <- predict(rf_FAST, newdata=featureList[['FAST']][idx_test,])
  metricList$ACC[i,4] <- acc_fun(hy_FAST, cell_label[idx_test])
  metricList$Kappa[i,4] <- kappa_fun(hy_FAST, cell_label[idx_test])
}

DT::datatable(t(round(sapply(metricList, colMeans, na.rm=T),2) ))

```

<details>
<summary>**Session Info**</summary>
```{r}
sessionInfo()
```
</details>