## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup-------------------------------------------------------------------- library(ulrb) library(cluster) library(dplyr) library(ggplot2) library(tidyr) # a vector with some colors qualitative_colors <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") ## ----------------------------------------------------------------------------- # Load raw OTU table from N-ICE data("nice_raw", package = "ulrb") # Change name of first column nice_clean <- rename(nice_raw, Taxonomy = "X.SampleID") # Select 16S rRNA amplicon sequencing samples selected_samples <- c("ERR2044662", "ERR2044663", "ERR2044664", "ERR2044665", "ERR2044666", "ERR2044667", "ERR2044668", "ERR2044669", "ERR2044670") # Add a column with phylogenetic units ID (OTU in this case) nice_clean <- mutate(nice_clean, OTU = paste0("OTU_", row_number())) # Select relevant collumns nice_clean <- select(nice_clean, selected_samples, OTU, Taxonomy) # Separate Taxonomy column into each taxonomic level nice_clean <- separate(nice_clean, Taxonomy, c("Domain","Kingdom","Phylum", "Class","Order","Family", "Genus","Species"), sep=";") # Remove Kingdom column, because it is not used for prokaryotes nice_clean <- select(nice_clean, -Kingdom) # Remove eukaryotes nice_clean <- filter(nice_clean, Domain != "sk__Eukaryota") # Remove unclassified OTUs at phylum level nice_clean <- filter(nice_clean, !is.na(Phylum)) # Simplify name nice <- nice_clean # Tidy data nice_tidy <- prepare_tidy_data(nice, sample_names = selected_samples, samples_in = "cols") ## ----------------------------------------------------------------------------- rb_default <- define_rb(nice_tidy) ## ----fig.width=5, fig.height=4------------------------------------------------ plot_ulrb_clustering(rb_default, sample_id = "ERR2044662", taxa_col = "OTU", log_scaled = TRUE) ## ----fig.width=8, fig.height=8------------------------------------------------ plot_ulrb_clustering(rb_default, taxa_col = "OTU", log_scaled = TRUE, plot_all = TRUE) ## ----------------------------------------------------------------------------- rb_k2 <- define_rb(nice_tidy, classification_vector = c("Rare", "Abundant")) ## ----fig.width = 5, fig.height= 4--------------------------------------------- plot_ulrb_clustering(rb_k2, taxa_col = "OTU", plot_all = TRUE, log_scaled = TRUE, colors = c("#009E73", "#F0E442")) ## ----fig.width=15------------------------------------------------------------- gridExtra::grid.arrange( plot_ulrb_clustering(rb_default, taxa_col = "OTU", log_scaled = TRUE, plot_all = TRUE), plot_ulrb_clustering(rb_k2, taxa_col = "OTU", plot_all = TRUE, log_scaled = TRUE, colors = c("#009E73", "#F0E442")), nrow = 1 ) ## ----------------------------------------------------------------------------- # rb_k4 <- define_rb(nice_tidy, classification_vector = c("very rare", "rare", "abundant", "very abundant")) # ## ----fig.width=15, fig.height=4----------------------------------------------- # One sample as example plot_ulrb(rb_k4, sample_id = "ERR2044662", taxa_col = "OTU", colors = c("#009E73", "#F0E442", "grey","#CC79A7"), log_scaled = TRUE) ## ----fig.width=15, fig.height=8----------------------------------------------- # all samples plot_ulrb(rb_k4, taxa_col = "OTU", colors = c("#009E73", "#F0E442", "grey","#CC79A7"), log_scaled = TRUE, plot_all = TRUE) ## ----------------------------------------------------------------------------- # rb_k5 <- define_rb(nice_tidy, classification_vector = c("very rare", "rare", "undetermined", "abundant", "very abundant")) ## ----fig.width=15, fig.height=4----------------------------------------------- # One sample as example plot_ulrb(rb_k5, sample_id = "ERR2044662", taxa_col = "OTU", colors = qualitative_colors[1:5], log_scaled = TRUE) ## ----fig.width=15, fig.height=8----------------------------------------------- # All samples plot_ulrb(rb_k5, taxa_col = "OTU", colors = qualitative_colors[1:5], log_scaled = TRUE, plot_all = TRUE) ## ----------------------------------------------------------------------------- # rb_k1 <- define_rb(nice_tidy, classification_vector = c("rare")) ## ----fig.width=5, fig.height=4------------------------------------------------ plot_ulrb_clustering(rb_k1, taxa_col = "OTU", colors = "green4", plot_all = TRUE, log_scaled = TRUE) ## ----------------------------------------------------------------------------- rb_sample1 <- nice_tidy %>% filter(Sample == "ERR2044662") # Calculate maximum k max_k_sample1 <- rb_sample1 %>% pull(Abundance) %>% unique() %>% length() # max_k_sample1 # Improvise a classification vector for maximum k # that is just any vector with the same length classification_vector_max_k_sample1 <- seq_along(1:max_k_sample1) # rb_sample1_max_k <- define_rb(rb_sample1, classification_vector = classification_vector_max_k_sample1) # rb_sample1_max_k %>% select(OTU, Classification, Abundance) %>% head(10) ## ----------------------------------------------------------------------------- suggest_k(nice_tidy, detailed = TRUE) ## ----------------------------------------------------------------------------- suggest_k(nice_tidy, detailed = TRUE, range = 10:20) ## ----------------------------------------------------------------------------- ## One sample # To get values check_avgSil(nice_tidy, sample_id = selected_samples[1]) # To plot results check_avgSil(nice_tidy, sample_id = selected_samples[1], with_plot = TRUE) ## ----------------------------------------------------------------------------- ## Davie-Boulding index # To get values check_DB(nice_tidy, sample_id = selected_samples[1]) # To plot results check_DB(nice_tidy, sample_id = selected_samples[1], with_plot = TRUE) ## Calinski-Harabasz index # To get values check_CH(nice_tidy, sample_id = selected_samples[1]) # To plot results check_CH(nice_tidy, sample_id = selected_samples[1], with_plot = TRUE) ## ----------------------------------------------------------------------------- evaluate_sample_k(nice_tidy, sample_id = selected_samples[1], with_plot = TRUE) ## ----------------------------------------------------------------------------- ## To get values evaluate_k(nice_tidy) ## To plot evaluate_k(nice_tidy, with_plot = TRUE) ## ----------------------------------------------------------------------------- # default option with average Silhouette score suggest_k(nice_tidy) # best k for Davies-Bouldin suggest_k(nice_tidy, index = "Davies-Bouldin") # best k for Calinski-Harabasz suggest_k(nice_tidy, index = "Calinski-Harabasz") ## ----------------------------------------------------------------------------- automatic_classification <- define_rb(nice_tidy, automatic = TRUE) # Plot automatic result plot_ulrb(automatic_classification, taxa_col = "OTU", plot_all = TRUE) ## ----------------------------------------------------------------------------- more_complex_automatic_classification <- define_rb(nice_tidy, automatic = TRUE, index = "Calinski-Harabasz", range = 4:6) ## ----------------------------------------------------------------------------- # Plot automatic result plot_ulrb(more_complex_automatic_classification, plot_all = TRUE, taxa_col = "OTU", colors = qualitative_colors[1:5], log_scaled = TRUE) ## ----------------------------------------------------------------------------- # Start by deciding the maximum range across the entire dataset max_k <- nice_tidy %>% filter(Abundance > 0, !is.na(Abundance)) %>% group_by(Sample) %>% summarise(topK = length(unique(Abundance))) %>% ungroup() %>% pull(topK) %>% min() # print maximum number of clusters allowed for all samples in the N-ICE dataset max_k ## ----------------------------------------------------------------------------- evaluate_k(nice_tidy, with_plot = TRUE, range = 2:max_k)