---
title: 'Benchmarking SignacX and SingleR with synovial flow cytometry data'
author: 'Mathew Chamberlain'
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Benchmarking SignacX and SingleR with flow-sorted data}
  %\VignetteEngine{knitr::rmarkdown_notangle}
  %\VignetteEncoding{UTF-8}
---

This vignette shows how to use Signac to annotate flow-sorted synovial cells by integrating SignacX with Seurat. We also compared Signac to another popular cell type annotation tool, SingleR. We start with raw counts. 

```{r setup0, include=FALSE}
all_times <- list()  # store the time for each chunk
knitr::knit_hooks$set(time_it = local({
  now <- NULL
  function(before, options) {
    if (before) {
      now <<- Sys.time()
    } else {
      res <- difftime(Sys.time(), now, units = "secs")
      all_times[[options$label]] <<- res
    }
  }
}))
knitr::opts_chunk$set(
  tidy = TRUE,
  tidy.opts = list(width.cutoff = 95),
  message = FALSE,
  warning = FALSE,
  time_it = TRUE
)
celltypes = readRDS(file = "fls/celltypes_amp_synovium.rds")
True_labels = readRDS(file = "fls/celltypes_amp_synovium_true.rds")
load(file = "fls/SingleR_Results.rda")
```

## Load data

Read the CEL-seq2 data.

```{r read CELSeq2, message = F, eval = F}
ReadCelseq <- function (counts.file, meta.file)
{
  E = suppressWarnings(readr::read_tsv(counts.file));
  gns <- E$gene;
  E = E[,-1]
  E = Matrix::Matrix(as.matrix(E), sparse = TRUE)
  rownames(E) <- gns
  E
}

counts.file = "./fls/celseq_matrix_ru10_molecules.tsv.gz"
meta.file = "./fls/celseq_meta.immport.723957.tsv"

E = ReadCelseq(counts.file = counts.file, meta.file = meta.file)
M = suppressWarnings(readr::read_tsv(meta.file))

# filter data based on depth and number of genes detected
kmu = Matrix::colSums(E != 0)
kmu2 = Matrix::colSums(E)
E = E[,kmu > 200 & kmu2 > 500]

# filter by mitochondrial percentage
logik = grepl("^MT-", rownames(E))
MitoFrac = Matrix::colSums(E[logik,]) / Matrix::colSums(E) * 100
E = E[,MitoFrac < 20]
```

## SingleR

```{r filter celseq, message = F, eval = F}
require(SingleR)
data("hpca")
Q = SingleR(sc_data = E, ref_data = hpca$data, types = hpca$main_types, fine.tune = F, numCores = 4)
save(file = "fls/SingleR_Results.rda", Q)
True_labels = M$type[match(colnames(E), M$cell_name)]
saveRDS(True_labels, file = "fls/celltypes_amp_synovium_true.rds")
```

## Seurat 

Start with the standard pre-processing steps for a Seurat object.

```{r setupSeurat, message = F, eval = F}
library(Seurat)
```

Create a Seurat object, and then perform SCTransform normalization. Note:

* You can use the legacy functions here (i.e., NormalizeData, ScaleData, etc.), use SCTransform or any other normalization method (including no normalization). We did not notice a significant difference in cell type annotations with different normalization methods.
* We think that it is best practice to use SCTransform, but it is not a necessary step. Signac will work fine without it.

```{r Seurat, message = T, eval = F}
# load data
synovium <- CreateSeuratObject(counts = E, project = "FACs")

# run sctransform
synovium <- SCTransform(synovium, verbose = F)
```

Perform dimensionality reduction by PCA and UMAP embedding. Note:

* Signac actually needs these functions since it uses the nearest neighbor graph generated by Seurat.

```{r Seurat 2, message = T, eval = F}
# These are now standard steps in the Seurat workflow for visualization and clustering
synovium <- RunPCA(synovium, verbose = FALSE)
synovium <- RunUMAP(synovium, dims = 1:30, verbose = FALSE)
synovium <- FindNeighbors(synovium, dims = 1:30, verbose = FALSE)
```

## SignacX

```{r setup2, message = F, eval = F}
library(SignacX)
```

Generate Signac labels for the Seurat object. Note:

* Optionally, you can do parallel computing by setting num.cores > 1 in the Signac function.
* Run time is ~10 minutes for <100,000 cells.

```{r Signac, message = T, eval = F}
# Run Signac
labels <- Signac(synovium, num.cores = 4)
celltypes = GenerateLabels(labels, E = synovium)
```

## Compare SignacX and SingleR with FACs labels

```{r Seurat Visualization 1110, message = F, echo = F}
xx = celltypes$CellTypes
xx = as.character(xx)
xx[xx == "Plasma.cells"] = "B"
xx[xx == "NonImmune"] = as.character(celltypes$CellStates)[xx == "NonImmune"]
xx[xx == "B"] = "B"
xx[xx == "Fibroblasts"] = "F"
xx[xx == "MPh"] = "M"
xx[xx == "TNK"] = "T"
xx[celltypes$CellStates == "NK"] = "NK"
xy = xx
True_labels[True_labels == "B cell"] = "B"
True_labels[True_labels == "Fibroblast"] = "F"
True_labels[True_labels == "Monocyte"] = "M"
True_labels[True_labels == "T cell"] = "T"
tabsignac = table(FACs = True_labels, Signac = xy)
```

SignacX (rows are FACs labels, columns are SignacX)

```{r Compare 22, echo = F, eval = T}
knitr::kable(tabsignac, format = "html")
```

```{r Seurat Visualization 10, message = F, echo = F}
xx = Q$labels
xx[xx %in% c("Macrophage", "DC", "Monocyte", "Platelets", "Neutrophils")] = "M"
xx[xx %in% c("B_cell", "Pre-B_cell_CD34-", "Pro-B_cell_CD34+")] = "B"
xx[xx == "Fibroblasts"] = "F"
xx[xx == "T_cells"] = "T"
xx[xx %in% c("Astrocyte", "Osteoblasts", "Tissue_stem_cells", "Smooth_muscle_cells", "MSC")] = "NonImmune"
xx[xx == "NK_cell"] = "NK"
xx[xx == "Chondrocytes"] = "Chondr."
tab = table(FACs = True_labels, SingleR = xx)
```

SingleR (rows are FACs labels, columns are SingleR)

```{r Compare 223, echo = F, eval = T}
knitr::kable(tab, format = "html")
```

Note:

* These data were flow-sorted for CD3, and therefore lack NK cells. Signac detected no NK cells.
* F = Fibroblasts; M = monocytes; T = T cells; B = B cells
* Note that SingleR inaccurately annotated the majority of fibroblast cells as chondrocytes (Chondr.) and non immune cells, and also significantly identified NK cells (even though the data were flow-sorted for CD3, and thus lacked NK cells).

Signac accuracy

```{r Seurat Visualization 11110, message = F, echo = T}
logik = xy != "Unclassified"
Signac_Accuracy = round(sum(xy[logik] == True_labels[logik]) / sum(logik) * 100, 2)
Signac_Accuracy
```

SingleR accuracy

```{r Seurat Visualization 111110, message = F, echo = T}
SingleR_Accuracy = round(sum(xx == True_labels) / sum(logik) * 100, 2)
SingleR_Accuracy
```

Save results

```{r save results, message = F, eval = F}
saveRDS(synovium, file = "synovium_signac.rds")
saveRDS(celltypes, file = "synovium_signac_celltypes.rds")
```

```{r save.times, include = FALSE, eval = F}
write.csv(x = t(as.data.frame(all_times)), file = "fls/tutorial_times_signac-seurat_singler_AMP_RA.csv")
```

<details>
  <summary>**Session Info**</summary>
```{r, echo=FALSE}
sessionInfo()
```
</details>