## ----setup, echo = FALSE, eval = TRUE, include = FALSE------------------------ # Set global options knitr::opts_chunk$set(eval = TRUE, include = TRUE, echo = TRUE, message = FALSE, warning = FALSE, error = FALSE, dpi = 72) # Define custom hooks for handling errors, warnings, and messages during knitting knitr::knit_hooks$set( error = function(x, options){ paste('\n\n<div class="alert alert-danger">', gsub('##', '\n', gsub('^##\ Error', '**Error**', x)), '</div>', sep = '\n')}, warning = function(x, options){ paste('\n\n<div class="alert alert-warning">', gsub('##', '\n', gsub('^##\ Warning:', '**Warning**', x)), '</div>', sep = '\n')}, message = function(x, options){ paste('\n\n<div class="alert alert-info">', gsub('##', '\n', x), '</div>', sep = '\n')}) #Load AntibodyForests functions devtools::load_all(".") ## ----eval = FALSE------------------------------------------------------------- # #Install from CRAN # install.packages("Platypus") # install.packages("AntibodyForests") ## ----eval = FALSE------------------------------------------------------------- # #Load the libraries # library(Platypus) # library(AntibodyForests) ## ----installation, echo = FALSE, eval = FALSE--------------------------------- # # List of CRAN packages required to run all functions # CRAN_packages <- c("Platypus", "alakazam", "ape", "base", # "base64enc", "BiocManager", "dplyr", # "grDevices", "gtools", "igraph", # "jsonlite", "knitr", "parallel", # "phangorn", "scales", "stats", # "stringdist", "stringr", "utils") # # # List of Bioconductor packages required to run all functions # Bioconductor_packages <- c("Biostrings", "msa") # # # Iterate through packages # for(package in c(CRAN_packages, Bioconductor_packages)){ # # Check if the current packages is already installed # if(!requireNamespace(package, quietly = TRUE)){ # # Install CRAN packages if not installed # if(package %in% CRAN_packages){install.packages(package)} # # Install Bioconductor packages if not installed # if(package %in% Bioconductor_packages){BiocManager::install(package)} # } # } # ## ----quick-start1, eval = FALSE----------------------------------------------- # # Import 10x Genomics output files into VDJ dataframe and only keep cells with one VDJ and one VJ transcript # # For each clone, trim germline sequence and replace CDR3 region in germline with most-frequently observed CDR3 sequence # VDJ_OVA <- Platypus::VDJ_build(VDJ.directory = "10x_output/VDJ/", # remove.divergent.cells = TRUE, # complete.cells.only = TRUE, # trim.germlines = TRUE) # # # Build lineage trees for all clones present in the VDJ dataframe with the default algorithm # AntibodyForests_OVA_default <- Af_build(VDJ = VDJ_OVA, # construction.method = "phylo.network.default") ## ----quick-start2, eval = F--------------------------------------------------- # # Plot one of the lineage trees, constructed with the `phylo.network.default' method # Af_plot_tree(AntibodyForests_object = AntibodyForests_OVA_default, # sample = "S1", # clonotype = "clonotype2") ## ----quick-start3, eval = F--------------------------------------------------- # # Cluster the trees in the repertoire based on the euclidean distance between various metrics of each tree # out <- Af_compare_within_repertoires(input = AntibodyForests_OVA_default, # min.nodes = 5, # distance.method = "euclidean", # distance.metrics = c("mean.depth", "sackin.index", "spectral.density"), # clustering.method = "mediods", # visualization.methods = "PCA", # plot.label = T) # # # Analyze the difference between the clusters based on the calculated sackin.index # plots <- Af_cluster_metrics(input = AntibodyForests_OVA_default, # clusters = out$clustering, # metrics = c("spectral.density"), # min.nodes = 5, # significance = T) # ## ----quick-start4, eval=FALSE------------------------------------------------- # # Plot the PCA colored on the assigned clusters # out$plots$PCA_clusters ## ----quick-start5, eval=FALSE------------------------------------------------- # # Plot the different number of modalities between the clusters # plots$modalities ## ----vdj_build-example-3-code, eval = FALSE----------------------------------- # # Read in data, keep only complete, non-divergent cells and trim germline sequences # VDJ_OVA <- Platypus::VDJ_build(VDJ.directory = "10x_output/VDJ/", # remove.divergent.cells = TRUE, # complete.cells.only = TRUE, # trim.germlines = TRUE, # fill.germline.CDR3 = TRUE) ## ----vdj_build-example-3-message, echo = FALSE, results = "asis"-------------- # Print number of filtered cells/barcodes warning("Please be aware of the number of cells excluded, when 'remove.divergent.cells' or 'complete.cells.only' is set to TRUE:\n") print(knitr::kable(data.frame(divergent.cells = c(360, 295, 79, 183 ,526), incomplete.cells = c(708, 503, 387, 380, 1113), row.names = c("S1", "S2", "S3", "S4", "S5")))) ## ----vdj_build-example-3-df, echo = FALSE------------------------------------- # Load the example VDJdataframe load("objects/VDJ_OVA_dummy_3.RData") knitr::kable(head(VDJ_OVA_dummy_3[,c("sample_id", "barcode", "isotype", "clonotype_id", "clonotype_frequency", "VDJ_vgene", "VDJ_jgene")], 10)) ## ----AntibodyForests-example-default, eval = FALSE---------------------------- # # Infer lineage trees for all clones using the 'phylo.network.default' construction method and using all default parameter settings # AntibodyForests_OVA_default <- Af_build(VDJ = VDJ_OVA, # sequence.columns = c("VDJ_sequence_nt_trimmed", "VJ_sequence_nt_trimmed"), # germline.columns = c("VDJ_germline_nt_trimmed", "VJ_germline_nt_trimmed"), # concatenate.sequences = FALSE, # node.features = "isotype", # construction.method = "phylo.network.default", # string.dist.metric = "lv", # resolve.ties = c("max.expansion", "close.germline.dist", "close.germline.edges", "random"), # parallel = TRUE) ## ----AntibodyForests-example-ML, eval = FALSE--------------------------------- # # Infer lineage trees for all clones using the 'phylo.tree.ml' construction method and using all default parameter settings # AntibodyForests_OVA_ml <- Af_build(VDJ = VDJ_OVA, # sequence.columns = c("VDJ_sequence_nt_trimmed", "VJ_sequence_nt_trimmed"), # germline.columns = c("VDJ_germline_nt_trimmed", "VJ_germline_nt_trimmed"), # concatenate.sequences = FALSE, # construction.method = "phylo.tree.ml", # dna.model = "all", # remove.internal.nodes = "connect.to.parent", # parallel = TRUE) ## ----AntibodyForests-example-IgPhyML, eval = FALSE---------------------------- # # Infer lineage trees for all clones using the 'phylo.tree.IgPhyML' construction method and using all default parameter settings # AntibodyForests_OVA_IgPhyML <- Af_build(VDJ = VDJ_OVA, # sequence.columns = "VDJ_sequence_nt_trimmed", # germline.columns = "VDJ_germline_nt_trimmed", # construction.method = "phylo.tree.IgPhyML", # IgPhyML.output.file = "data/OVA_scRNA-seq_data/VDJ/airr_rearrangement_igphyml-pass.tab", # remove.internal.nodes = "connect.to.parent", # parallel = TRUE) ## ----AntibodyForests_plot_ML_with_IN_example, eval = FALSE-------------------- # # Plot lineage tree, constructed with the `phylo.tree.ml' method, without internal nodes # Af_plot_tree(AntibodyForests_object = AntibodyForests_OVA_ml, # sample = "S1", # clonotype = "clonotype8", # show.inner.nodes = TRUE, # label.by = "name", # color.by = "isotype", # node.size = "expansion", # main.title = "Lineage tree - ML", # sub.title = "OVA - S1 - clonotype8") ## ----AntibodyForests_plot_ML_without_IN_example, eval = FALSE----------------- # # Plot lineage tree, constructed with the `phylo.tree.ml' method, without internal nodes # Af_plot_tree(AntibodyForests_object = AntibodyForests_OVA_ml, # sample = "S1", # clonotype = "clonotype8", # show.inner.nodes = FALSE, # label.by = "name", # color.by = "isotype", # node.size = "expansion", # main.title = "Lineage tree - ML", # sub.title = "OVA - S1 - clonotype8") ## ----Af_sync_nodes, eval= FALSE----------------------------------------------- # # Synchronize the nodes of the different AntibodyForests objects # AntibodyForests_OVA_IgPhyML <- Af_sync_nodes(reference = AntibodyForests_OVA_default, # subject = AntibodyForests_OVA_IgPhyML) # AntibodyForests_OVA_mst <- Af_sync_nodes(reference = AntibodyForests_OVA_default, # subject = AntibodyForests_OVA_mst) # AntibodyForests_OVA_nj <- Af_sync_nodes(reference = AntibodyForests_OVA_default, # subject = AntibodyForests_OVA_nj) # AntibodyForests_OVA_mp <- Af_sync_nodes(reference = AntibodyForests_OVA_default, # subject = AntibodyForests_OVA_mp) # AntibodyForests_OVA_ml <- Af_sync_nodes(reference = AntibodyForests_OVA_default, # subject = AntibodyForests_OVA_ml) ## ----compare_methods_example2, eval = FALSE----------------------------------- # # Calculate the euclidean distance between different trees based on the GBLD # out <- Af_compare_methods(input = list("Default" = AntibodyForests_OVA_default, # "MST" = AntibodyForests_OVA_mst, # "NJ" = AntibodyForests_OVA_nj, # "MP" = AntibodyForests_OVA_mp, # "ML" = AntibodyForests_OVA_ml, # "IgPhyML" = AntibodyForests_OVA_IgPhyML), # min.nodes = 10, # distance.method = "GBLD", # visualization.methods = "heatmap", # include.average = T) # # Plot the average heatmap # out$average$Heatmap ## ----metric_dataframe1, eval = FALSE------------------------------------------ # # Calculate the metric dataframe for the AntibodyForests object # metric_df <- Af_metrics(input = AntibodyForests_OVA_default, # min.nodes = 10, # metrics = c("mean.depth", "sackin.index", "spectral.density")) # ## ----metric_dataframe2-------------------------------------------------------- load("objects/metric_df.RData") ## ----metric_dataframe3, echo=FALSE-------------------------------------------- # Print the dataframe DT::datatable(data = metric_df, rownames = TRUE, options = list(scrollX = TRUE)) ## ----compare_distance_isotype, eval=FALSE------------------------------------- # # Cluster the trees in the repertoire based on the euclidean distance between various metrics of each tree # Af_distance_boxplot(AntibodyForests_object = AntibodyForests_OVA_default, # node.feature = "isotype", # distance = "node.depth", # min.nodes = 8, # groups = c("IgG1", "IgA"), # text.size = 16, # unconnected = T, # significance = T) ## ----compare_within_repertoire_example1, eval = FALSE------------------------- # # Cluster the trees in the repertoire based on the euclidean distance between various metrics of each tree # out <- Af_compare_within_repertoires(input = AntibodyForests_OVA_default, # min.nodes = 8, # distance.method = "euclidean", # distance.metrics = c("mean.depth", "sackin.index", "spectral.density"), # clustering.method = "mediods", # visualization.methods = "PCA") # # Plot the PCA colored on the assigned clusters # out$plots$PCA_clusters ## ----cluster_metrics_example1, eval = FALSE----------------------------------- # # Analyze the difference between the clusters based on the calculated sackin.index # plots <- Af_cluster_metrics(input = AntibodyForests_OVA_default, # clusters = out$clustering, # metrics = c("sackin.index", "spectral.density"), # min.nodes = 8, # significance = T) # # plots$sackin.index # plots$modalities ## ----cluster_node_features_example1, eval = FALSE----------------------------- # # Analyze the difference between the clusters based on the calculated sackin.index # plots <- Af_cluster_node_features(input = AntibodyForests_OVA_default, # clusters = out$clustering, # features = "isotype", # fill = "max", # significance = T) # # plots$isotype ## ----compare_within_repertoire_example2, eval = FALSE------------------------- # # Cluster the trees in the repertoire based on the Jensen-Shannon divergence between the Spectral Density profiles # out <- Af_compare_within_repertoires(input = AntibodyForests_OVA_default, # min.nodes = 8, # distance.method = "jensen-shannon", # clustering.method = "mediods", # visualization.methods = "heatmap") # # Plot the heatmap # out$plots$heatmap_clusters ## ----cluster_metrics_example2, eval=FALSE------------------------------------- # # Analyze the difference between the clusters based on the calculated sackin.index # plots <- Af_cluster_metrics(input = AntibodyForests_OVA_default, # clusters = out$clustering, # metrics = "spectral.density", # min.nodes = 8, # significance = T) # # plots$spectral.asymmetry # plots$modalities ## ----compare_across_repertoire_example1, eval=FALSE--------------------------- # #Calculate the difference between the two groups of repertoires # boxplots <- Af_compare_across_repertoires(list("Blood" = AntibodyForests_example_blood, # "Lymph Nodes" = AntibodyForests_example_LN), # metrics = c("betweenness", "mean.depth"), significance = T) # # #Plot the boxplots # boxplots$betweenness # boxplots$mean.depth ## ----add_pseudolikelihood_example, eval=FALSE--------------------------------- # # Add pseudolikelihood as node feature to the AntibodyForests object # AntibodyForests_OVA_default <- Af_add_node_feature(AntibodyForests_object = AntibodyForests_OVA_default, # feature.df = VDJ_OVA, # feature.names = "pseudolikelihood") ## ----plot_pseudolikelihood_example1, eval=FALSE------------------------------- # # Plot pseudolikelihood against distance to the germline # Af_distance_scatterplot(AntibodyForests_object = AntibodyForests_OVA_default, # node.features = "pseudolikelihood", # distance = "edge.length", # min.nodes = 5, # correlation = "pearson", # color.by = "sample") ## ----get_sequences_probability_example, eval=FALSE---------------------------- # #Create a dataframe of the sequences with a sequence ID containing the sample, clonotype, and node number # df <- Af_get_sequences(AntibodyForests_object = AntibodyForests_OVA_default, # sequence.name = "VDJ_sequence_aa_trimmed", # min.edges = 1) # #Save the data frame as CSV to serve as input for the PLM-pipeline # write.csv(df, file = "/directory/PLM_input.csv", row.names = FALSE) ## ----probability_matrix_example, eval=FALSE----------------------------------- # #Create a PLM dataframe # PLM_dataframe <- Af_PLM_dataframe(AntibodyForests_object = AntibodyForests_OVA_default, # sequence.name = "VDJ_sequence_aa_trimmed", # path_to_probabilities = "/directory/ProbabilityMatrix") ## ----load_plm_dataframe, echo=FALSE------------------------------------------- load("objects/PLM_dataframe.RData") ## ----vdj_plm-df, echo = FALSE------------------------------------------------- knitr::kable(PLM_dataframe) ## ----plot_likelihood_example1, eval = FALSE----------------------------------- # # Plot the substitution rank # Af_plot_PLM(PLM_dataframe = PLM_dataframe, # values = "substitution_rank", # group_by = "sample_id") ## ----load_structure_data, echo=FALSE------------------------------------------ load("objects/VDJ_ova_structure.RData") #vdj load("objects/VDJ_structure_AbAg.RData") #vdj_structure_AbAg load("objects/AntibodyForests_OVA_structure_AbAg.RData") #af_structure_AbAg ## ----structure_example2, eval = F--------------------------------------------- # #Biophysical properties of the heavy and light chain sequence of the antibodies are calculated # vdj_structure_AbAg <- VDJ_3d_properties(VDJ = vdj, # pdb.dir = "~/path/PDBS_superimposed/", # file.df = files, # properties = c("charge", # "3di_germline", # "hydrophobicity", # "RMSD_germline", # "pKa_shift", # "free_energy", # "pLDDT"), # chain = "whole.complex", # sequence.region = "binding.residues", # propka.dir = "~/path/Propka_output/", # germline.pdb = "~/path/PDBS_superimposed/germline_5_model_0.pdb", # foldseek.dir = "~/path/3di_sequences/") # # #Add the biophysical properties to the AntibodyForests object # af_structure_AbAg <- Af_add_node_feature(AntibodyForests_object = af, # feature.df = vdj_structure_AbAg, # feature.names = c("average_charge", # "3di_germline", # "average_hydrophobicity", # "RMSD_germline", # "average_pKa_shift", # "free_energy", # "average_pLDDT")) ## ----structure_example4_plot, eval = T, fig.width=5, fig.height=5------------- Af_distance_scatterplot(af_structure_AbAg, node.features = "free_energy", correlation = "pearson", color.by = "average_pLDDT", color.by.numeric = T, geom_smooth.method = "lm") ## ----plot_structure_example1, eval = F---------------------------------------- # rmsd <- Af_edge_RMSD(af, # VDJ = vdj, # pdb.dir = "~/path/PDBS_superimposed/", # file.df = files, # sequence.region = "full.sequence", # chain = "HC+LC") # rmsd$plot ## ----write-AIRR-rearrangement, eval = FALSE----------------------------------- # # Append the IgBLAST annotations and alignment to the VDJ dataframe # VDJ_OVA <- VDJ_import_IgBLAST_annotations(VDJ = VDJ_OVA, # VDJ.directory = "../data/OVA_scRNA-seq_data/VDJ/", # method = "append") # # # Write the VDJ dataframe into an AIRR-formatted TSV file # VDJ_to_AIRR(VDJ = VDJ_OVA, # output.file = "../data/OVA_scRNA-seq_data/VDJ/airr_rearrangement.tsv") ## ----bulk_example, eval = FALSE----------------------------------------------- # # Integrate bulk RNA-seq data into the VDJ dataframe, if multiple clonotypes match the bulk sequence, the bulk transcript is added to all clonotypes (tie.resolvement = "all) # VDJ_OVA <- VDJ_integrate_bulk(sc.VDJ = VDJ_OVA, # bulk.tsv = "path/to/bulk/OVA_bulkRNA.tsv", # bulk.tsv.sequence.column="VDJ_sequence_nt_raw", # bulk.tsv.sample.column="sample_id", # bulk.tsv.barcode.column="barcode", # bulk.tsv.isotype.column="isotype", # organism="mouse", # igblast.dir="/path/to/anaconda3/envs/igphyml/share/igblast", # tie.resolvement = "all") # # #Create the lineage tree of the integrated VDJ dataframe # af <- Af_build(VDJ = VDJ_OVA, # sequence.columns = "VDJ_sequence_aa_trimmed", # germline.columns = "VDJ_germline_aa_trimmed", # construction.method = "phylo.network.default", # node.features = c("dataset")) # # #Plot lineage tree # Af_plot_tree(af, # sample = "S1", # clonotype = "clonotype3", # color.by = "dataset", # label.by = "none", # main.title = "OVA S1", # sub.title = "clonotype3")