## ----setup, echo = FALSE, eval = TRUE, include = FALSE------------------------
# Set global options
knitr::opts_chunk$set(eval = TRUE, include = TRUE, echo = TRUE, message = FALSE, warning = FALSE, error = FALSE, dpi = 72)
# Define custom hooks for handling errors, warnings, and messages during knitting
knitr::knit_hooks$set(
   error = function(x, options){
     paste('\n\n<div class="alert alert-danger">',
           gsub('##', '\n', gsub('^##\ Error', '**Error**', x)),
           '</div>', sep = '\n')},
   warning = function(x, options){
     paste('\n\n<div class="alert alert-warning">',
           gsub('##', '\n', gsub('^##\ Warning:', '**Warning**', x)),
           '</div>', sep = '\n')},
   message = function(x, options){
     paste('\n\n<div class="alert alert-info">',
           gsub('##', '\n', x),
           '</div>', sep = '\n')})

#Load AntibodyForests functions
devtools::load_all(".")

## ----eval = FALSE-------------------------------------------------------------
#  #Install from CRAN
#  install.packages("Platypus")
#  install.packages("AntibodyForests")

## ----eval = FALSE-------------------------------------------------------------
#  #Load the libraries
#  library(Platypus)
#  library(AntibodyForests)

## ----installation, echo = FALSE, eval = FALSE---------------------------------
#  # List of CRAN packages required to run all functions
#  CRAN_packages <- c("Platypus", "alakazam", "ape", "base",
#                     "base64enc", "BiocManager", "dplyr",
#                     "grDevices", "gtools", "igraph",
#                     "jsonlite", "knitr", "parallel",
#                     "phangorn", "scales", "stats",
#                     "stringdist", "stringr", "utils")
#  
#  # List of Bioconductor packages required to run all functions
#  Bioconductor_packages <- c("Biostrings", "msa")
#  
#  # Iterate through packages
#  for(package in c(CRAN_packages, Bioconductor_packages)){
#    # Check if the current packages is already installed
#    if(!requireNamespace(package, quietly = TRUE)){
#      # Install CRAN packages if not installed
#      if(package %in% CRAN_packages){install.packages(package)}
#      # Install Bioconductor packages if not installed
#      if(package %in% Bioconductor_packages){BiocManager::install(package)}
#    }
#  }
#  

## ----quick-start1, eval = FALSE-----------------------------------------------
#  # Import 10x Genomics output files into VDJ dataframe and only keep cells with one VDJ and one VJ transcript
#  # For each clone, trim germline sequence and replace CDR3 region in germline with most-frequently observed CDR3 sequence
#  VDJ_OVA <- Platypus::VDJ_build(VDJ.directory = "10x_output/VDJ/",
#                       remove.divergent.cells = TRUE,
#                       complete.cells.only = TRUE,
#                       trim.germlines = TRUE)
#  
#  # Build lineage trees for all clones present in the VDJ dataframe with the default algorithm
#  AntibodyForests_OVA_default <- Af_build(VDJ = VDJ_OVA,
#                                            construction.method = "phylo.network.default")

## ----quick-start2, eval = F---------------------------------------------------
#  # Plot one of the lineage trees, constructed with the `phylo.network.default' method
#  Af_plot_tree(AntibodyForests_object = AntibodyForests_OVA_default,
#                       sample = "S1",
#                       clonotype = "clonotype2")

## ----quick-start3, eval = F---------------------------------------------------
#  # Cluster the trees in the repertoire based on the euclidean distance between various metrics of each tree
#  out <- Af_compare_within_repertoires(input = AntibodyForests_OVA_default,
#                                       min.nodes = 5,
#                                       distance.method = "euclidean",
#                                       distance.metrics = c("mean.depth", "sackin.index", "spectral.density"),
#                                       clustering.method = "mediods",
#                                       visualization.methods = "PCA",
#                                       plot.label = T)
#  
#  # Analyze the difference between the clusters based on the calculated sackin.index
#  plots <- Af_cluster_metrics(input = AntibodyForests_OVA_default,
#                     clusters = out$clustering,
#                     metrics = c("spectral.density"),
#                     min.nodes = 5,
#                     significance = T)
#  

## ----quick-start4, eval=FALSE-------------------------------------------------
#  # Plot the PCA colored on the assigned clusters
#  out$plots$PCA_clusters

## ----quick-start5, eval=FALSE-------------------------------------------------
#  # Plot the different number of modalities between the clusters
#  plots$modalities

## ----vdj_build-example-3-code, eval = FALSE-----------------------------------
#  # Read in data, keep only complete, non-divergent cells and trim germline sequences
#  VDJ_OVA <- Platypus::VDJ_build(VDJ.directory = "10x_output/VDJ/",
#                       remove.divergent.cells = TRUE,
#                       complete.cells.only = TRUE,
#                       trim.germlines = TRUE,
#                       fill.germline.CDR3 = TRUE)

## ----vdj_build-example-3-message, echo = FALSE, results = "asis"--------------
# Print number of filtered cells/barcodes 
warning("Please be aware of the number of cells excluded, when 'remove.divergent.cells' or 'complete.cells.only' is set to TRUE:\n")
print(knitr::kable(data.frame(divergent.cells = c(360, 295, 79, 183 ,526), 
                              incomplete.cells = c(708, 503, 387, 380, 1113),
                              row.names = c("S1", "S2", "S3", "S4", "S5"))))

## ----vdj_build-example-3-df, echo = FALSE-------------------------------------
# Load the example VDJdataframe
load("objects/VDJ_OVA_dummy_3.RData")
knitr::kable(head(VDJ_OVA_dummy_3[,c("sample_id", "barcode", "isotype", "clonotype_id", "clonotype_frequency", "VDJ_vgene", "VDJ_jgene")], 10))

## ----AntibodyForests-example-default, eval = FALSE----------------------------
#  # Infer lineage trees for all clones using the 'phylo.network.default' construction method and using all default parameter settings
#  AntibodyForests_OVA_default <- Af_build(VDJ = VDJ_OVA,
#                                                 sequence.columns = c("VDJ_sequence_nt_trimmed", "VJ_sequence_nt_trimmed"),
#                                                 germline.columns = c("VDJ_germline_nt_trimmed", "VJ_germline_nt_trimmed"),
#                                                 concatenate.sequences = FALSE,
#                                                 node.features = "isotype",
#                                                 construction.method = "phylo.network.default",
#                                                 string.dist.metric = "lv",
#                                                 resolve.ties = c("max.expansion", "close.germline.dist", "close.germline.edges", "random"),
#                                                 parallel = TRUE)

## ----AntibodyForests-example-ML, eval = FALSE---------------------------------
#  # Infer lineage trees for all clones using the 'phylo.tree.ml' construction method and using all default parameter settings
#  AntibodyForests_OVA_ml <- Af_build(VDJ = VDJ_OVA,
#                                            sequence.columns = c("VDJ_sequence_nt_trimmed", "VJ_sequence_nt_trimmed"),
#                                            germline.columns = c("VDJ_germline_nt_trimmed", "VJ_germline_nt_trimmed"),
#                                            concatenate.sequences = FALSE,
#                                            construction.method = "phylo.tree.ml",
#                                            dna.model = "all",
#                                            remove.internal.nodes = "connect.to.parent",
#                                            parallel = TRUE)

## ----AntibodyForests-example-IgPhyML, eval = FALSE----------------------------
#  # Infer lineage trees for all clones using the 'phylo.tree.IgPhyML' construction method and using all default parameter settings
#  AntibodyForests_OVA_IgPhyML <- Af_build(VDJ = VDJ_OVA,
#                                                 sequence.columns = "VDJ_sequence_nt_trimmed",
#                                                 germline.columns = "VDJ_germline_nt_trimmed",
#                                                 construction.method = "phylo.tree.IgPhyML",
#                                                 IgPhyML.output.file = "data/OVA_scRNA-seq_data/VDJ/airr_rearrangement_igphyml-pass.tab",
#                                                 remove.internal.nodes = "connect.to.parent",
#                                                 parallel = TRUE)

## ----AntibodyForests_plot_ML_with_IN_example, eval = FALSE--------------------
#  # Plot lineage tree, constructed with the `phylo.tree.ml' method, without internal nodes
#  Af_plot_tree(AntibodyForests_object = AntibodyForests_OVA_ml,
#                       sample = "S1",
#                       clonotype = "clonotype8",
#                       show.inner.nodes = TRUE,
#                       label.by = "name",
#                       color.by = "isotype",
#                       node.size = "expansion",
#                       main.title = "Lineage tree - ML",
#                       sub.title = "OVA - S1 - clonotype8")

## ----AntibodyForests_plot_ML_without_IN_example, eval = FALSE-----------------
#  # Plot lineage tree, constructed with the `phylo.tree.ml' method, without internal nodes
#  Af_plot_tree(AntibodyForests_object = AntibodyForests_OVA_ml,
#                       sample = "S1",
#                       clonotype = "clonotype8",
#                       show.inner.nodes = FALSE,
#                       label.by = "name",
#                       color.by = "isotype",
#                       node.size = "expansion",
#                       main.title = "Lineage tree - ML",
#                       sub.title = "OVA - S1 - clonotype8")

## ----Af_sync_nodes, eval= FALSE-----------------------------------------------
#  # Synchronize the nodes of the different AntibodyForests objects
#  AntibodyForests_OVA_IgPhyML <- Af_sync_nodes(reference = AntibodyForests_OVA_default,
#                                                      subject = AntibodyForests_OVA_IgPhyML)
#  AntibodyForests_OVA_mst <- Af_sync_nodes(reference = AntibodyForests_OVA_default,
#                                  subject = AntibodyForests_OVA_mst)
#  AntibodyForests_OVA_nj <- Af_sync_nodes(reference = AntibodyForests_OVA_default,
#                                  subject = AntibodyForests_OVA_nj)
#  AntibodyForests_OVA_mp <- Af_sync_nodes(reference = AntibodyForests_OVA_default,
#                                  subject = AntibodyForests_OVA_mp)
#  AntibodyForests_OVA_ml <- Af_sync_nodes(reference = AntibodyForests_OVA_default,
#                                  subject = AntibodyForests_OVA_ml)

## ----compare_methods_example2, eval = FALSE-----------------------------------
#  # Calculate the euclidean distance between different trees based on the GBLD
#  out <- Af_compare_methods(input = list("Default" = AntibodyForests_OVA_default,
#                                         "MST" = AntibodyForests_OVA_mst,
#                                         "NJ" = AntibodyForests_OVA_nj,
#                                         "MP" = AntibodyForests_OVA_mp,
#                                         "ML" = AntibodyForests_OVA_ml,
#                                         "IgPhyML" = AntibodyForests_OVA_IgPhyML),
#                            min.nodes = 10,
#                            distance.method = "GBLD",
#                            visualization.methods = "heatmap",
#                            include.average = T)
#  # Plot the average heatmap
#  out$average$Heatmap

## ----metric_dataframe1, eval = FALSE------------------------------------------
#  # Calculate the metric dataframe for the AntibodyForests object
#  metric_df <- Af_metrics(input = AntibodyForests_OVA_default,
#                          min.nodes = 10,
#                          metrics = c("mean.depth", "sackin.index", "spectral.density"))
#  

## ----metric_dataframe2--------------------------------------------------------
load("objects/metric_df.RData")

## ----metric_dataframe3, echo=FALSE--------------------------------------------
# Print the dataframe
DT::datatable(data = metric_df,
              rownames = TRUE,
              options = list(scrollX = TRUE))

## ----compare_distance_isotype, eval=FALSE-------------------------------------
#  # Cluster the trees in the repertoire based on the euclidean distance between various metrics of each tree
#  Af_distance_boxplot(AntibodyForests_object = AntibodyForests_OVA_default,
#                      node.feature = "isotype",
#                      distance = "node.depth",
#                      min.nodes = 8,
#                      groups = c("IgG1", "IgA"),
#                      text.size = 16,
#                      unconnected = T,
#                      significance = T)

## ----compare_within_repertoire_example1, eval = FALSE-------------------------
#  # Cluster the trees in the repertoire based on the euclidean distance between various metrics of each tree
#  out <- Af_compare_within_repertoires(input = AntibodyForests_OVA_default,
#                                       min.nodes = 8,
#                                       distance.method = "euclidean",
#                                       distance.metrics = c("mean.depth", "sackin.index", "spectral.density"),
#                                       clustering.method = "mediods",
#                                       visualization.methods = "PCA")
#  # Plot the PCA colored on the assigned clusters
#  out$plots$PCA_clusters

## ----cluster_metrics_example1, eval = FALSE-----------------------------------
#  # Analyze the difference between the clusters based on the calculated sackin.index
#  plots <- Af_cluster_metrics(input = AntibodyForests_OVA_default,
#                     clusters = out$clustering,
#                     metrics = c("sackin.index", "spectral.density"),
#                     min.nodes = 8,
#                     significance = T)
#  
#  plots$sackin.index
#  plots$modalities

## ----cluster_node_features_example1, eval = FALSE-----------------------------
#  # Analyze the difference between the clusters based on the calculated sackin.index
#  plots <- Af_cluster_node_features(input = AntibodyForests_OVA_default,
#                     clusters = out$clustering,
#                     features = "isotype",
#                     fill = "max",
#                     significance = T)
#  
#  plots$isotype

## ----compare_within_repertoire_example2, eval = FALSE-------------------------
#  # Cluster the trees in the repertoire based on the Jensen-Shannon divergence between the Spectral Density profiles
#  out <- Af_compare_within_repertoires(input = AntibodyForests_OVA_default,
#                                       min.nodes = 8,
#                                       distance.method = "jensen-shannon",
#                                       clustering.method = "mediods",
#                                       visualization.methods = "heatmap")
#  # Plot the heatmap
#  out$plots$heatmap_clusters

## ----cluster_metrics_example2, eval=FALSE-------------------------------------
#  # Analyze the difference between the clusters based on the calculated sackin.index
#  plots <- Af_cluster_metrics(input = AntibodyForests_OVA_default,
#                     clusters = out$clustering,
#                     metrics = "spectral.density",
#                     min.nodes = 8,
#                     significance = T)
#  
#  plots$spectral.asymmetry
#  plots$modalities

## ----compare_across_repertoire_example1, eval=FALSE---------------------------
#  #Calculate the difference between the two groups of repertoires
#  boxplots <- Af_compare_across_repertoires(list("Blood" = AntibodyForests_example_blood,
#                                                 "Lymph Nodes" = AntibodyForests_example_LN),
#                                            metrics = c("betweenness", "mean.depth"), significance = T)
#  
#  #Plot the boxplots
#  boxplots$betweenness
#  boxplots$mean.depth

## ----add_pseudolikelihood_example, eval=FALSE---------------------------------
#  # Add pseudolikelihood as node feature to the AntibodyForests object
#  AntibodyForests_OVA_default <- Af_add_node_feature(AntibodyForests_object = AntibodyForests_OVA_default,
#                                                                  feature.df = VDJ_OVA,
#                                                                  feature.names = "pseudolikelihood")

## ----plot_pseudolikelihood_example1, eval=FALSE-------------------------------
#  # Plot pseudolikelihood against distance to the germline
#  Af_distance_scatterplot(AntibodyForests_object = AntibodyForests_OVA_default,
#                                       node.features = "pseudolikelihood",
#                                       distance = "edge.length",
#                                       min.nodes = 5,
#                                       correlation = "pearson",
#                                       color.by = "sample")

## ----get_sequences_probability_example, eval=FALSE----------------------------
#  #Create a dataframe of the sequences with a sequence ID containing the sample, clonotype, and node number
#  df <- Af_get_sequences(AntibodyForests_object = AntibodyForests_OVA_default,
#                                                 sequence.name = "VDJ_sequence_aa_trimmed",
#                                                 min.edges = 1)
#  #Save the data frame as CSV to serve as input for the PLM-pipeline
#  write.csv(df, file = "/directory/PLM_input.csv", row.names = FALSE)

## ----probability_matrix_example, eval=FALSE-----------------------------------
#  #Create a PLM dataframe
#  PLM_dataframe <- Af_PLM_dataframe(AntibodyForests_object = AntibodyForests_OVA_default,
#                                                 sequence.name = "VDJ_sequence_aa_trimmed",
#                                                 path_to_probabilities = "/directory/ProbabilityMatrix")

## ----load_plm_dataframe, echo=FALSE-------------------------------------------
load("objects/PLM_dataframe.RData")

## ----vdj_plm-df, echo = FALSE-------------------------------------------------
knitr::kable(PLM_dataframe)

## ----plot_likelihood_example1, eval = FALSE-----------------------------------
#  # Plot the substitution rank
#  Af_plot_PLM(PLM_dataframe = PLM_dataframe,
#                           values = "substitution_rank",
#                           group_by = "sample_id")

## ----load_structure_data, echo=FALSE------------------------------------------
load("objects/VDJ_ova_structure.RData") #vdj
load("objects/VDJ_structure_AbAg.RData") #vdj_structure_AbAg
load("objects/AntibodyForests_OVA_structure_AbAg.RData") #af_structure_AbAg

## ----structure_example2, eval = F---------------------------------------------
#  #Biophysical properties of the heavy and light chain sequence of the antibodies are calculated
#  vdj_structure_AbAg <- VDJ_3d_properties(VDJ = vdj,
#                                     pdb.dir = "~/path/PDBS_superimposed/",
#                                     file.df = files,
#                                     properties = c("charge",
#                                                    "3di_germline",
#                                                    "hydrophobicity",
#                                                    "RMSD_germline",
#                                                    "pKa_shift",
#                                                    "free_energy",
#                                                    "pLDDT"),
#                                     chain = "whole.complex",
#                                     sequence.region = "binding.residues",
#                                     propka.dir = "~/path/Propka_output/",
#                                     germline.pdb = "~/path/PDBS_superimposed/germline_5_model_0.pdb",
#                                     foldseek.dir = "~/path/3di_sequences/")
#  
#  #Add the biophysical properties to the AntibodyForests object
#  af_structure_AbAg <- Af_add_node_feature(AntibodyForests_object = af,
#                                                   feature.df = vdj_structure_AbAg,
#                                                   feature.names = c("average_charge",
#                                                                     "3di_germline",
#                                                                     "average_hydrophobicity",
#                                                                     "RMSD_germline",
#                                                                     "average_pKa_shift",
#                                                                     "free_energy",
#                                                                     "average_pLDDT"))

## ----structure_example4_plot, eval = T, fig.width=5, fig.height=5-------------
Af_distance_scatterplot(af_structure_AbAg,
                        node.features = "free_energy",
                        correlation = "pearson",
                        color.by = "average_pLDDT",
                        color.by.numeric = T,
                        geom_smooth.method = "lm")

## ----plot_structure_example1, eval = F----------------------------------------
#  rmsd <- Af_edge_RMSD(af,
#               VDJ = vdj,
#               pdb.dir = "~/path/PDBS_superimposed/",
#               file.df = files,
#               sequence.region = "full.sequence",
#               chain = "HC+LC")
#  rmsd$plot

## ----write-AIRR-rearrangement, eval = FALSE-----------------------------------
#  # Append the IgBLAST annotations and alignment to the VDJ dataframe
#  VDJ_OVA <- VDJ_import_IgBLAST_annotations(VDJ = VDJ_OVA,
#                                            VDJ.directory = "../data/OVA_scRNA-seq_data/VDJ/",
#                                            method = "append")
#  
#  # Write the VDJ dataframe into an AIRR-formatted TSV file
#  VDJ_to_AIRR(VDJ = VDJ_OVA,
#              output.file = "../data/OVA_scRNA-seq_data/VDJ/airr_rearrangement.tsv")

## ----bulk_example, eval = FALSE-----------------------------------------------
#  # Integrate bulk RNA-seq data into the VDJ dataframe, if multiple clonotypes match the bulk sequence, the bulk transcript is added to all clonotypes (tie.resolvement = "all)
#  VDJ_OVA <- VDJ_integrate_bulk(sc.VDJ = VDJ_OVA,
#                     bulk.tsv = "path/to/bulk/OVA_bulkRNA.tsv",
#                     bulk.tsv.sequence.column="VDJ_sequence_nt_raw",
#                     bulk.tsv.sample.column="sample_id",
#                     bulk.tsv.barcode.column="barcode",
#                     bulk.tsv.isotype.column="isotype",
#                     organism="mouse",
#                     igblast.dir="/path/to/anaconda3/envs/igphyml/share/igblast",
#                     tie.resolvement = "all")
#  
#  #Create the lineage tree of the integrated VDJ dataframe
#  af <- Af_build(VDJ = VDJ_OVA,
#                 sequence.columns = "VDJ_sequence_aa_trimmed",
#                 germline.columns = "VDJ_germline_aa_trimmed",
#                 construction.method = "phylo.network.default",
#                 node.features = c("dataset"))
#  
#  #Plot lineage tree
#  Af_plot_tree(af,
#               sample = "S1",
#               clonotype = "clonotype3",
#               color.by = "dataset",
#               label.by = "none",
#               main.title = "OVA S1",
#               sub.title = "clonotype3")