## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) knitr::opts_chunk$set(warning = FALSE, echo = TRUE, fig.width = 7) custom_palette <- c("#8dae25", "#17365c") custom_palette <- colorRampPalette( custom_palette, bias = 1, interpolate = "linear" ) library(knitr) library(kableExtra) ## ----setup-------------------------------------------------------------------- library(clayringsmiletus) data("StackRMiletus") ## ----display_table_1, echo = FALSE, warning=FALSE, message=FALSE-------------- library(dplyr) StackRMiletus %>% filter(!is.na(Inv)) %>% arrange(Inv) %>% kable("html") %>% kable_styling( font_size = 9, full_width = FALSE, position = "left", bootstrap_options = c("striped", "hover", "condensed", "responsive")) ## ----density_prep, echo = TRUE, message= FALSE-------------------------------- library(reshape2) library(ggridges) library(ggplot2) StackRMiletus %>% transmute( thickness_mm = thickness * 10, height_mm = height * 10, width_cm = width, min_DM_cm = min_DM, max_DM_cm = max_DM) %>% melt() %>% ggplot(aes(x = value, y = variable, fill = variable)) + geom_density_ridges(scale = 4, bw = "SJ", alpha = 0.8) + scale_fill_manual(values = custom_palette(5)) + theme(legend.position = "none", panel.background = element_blank(), axis.title = element_blank()) + labs(title = "Numerical variables across all rings") ## ----hdbscan------------------------------------------------------------------ library(dbscan) hdbcluster <- StackRMiletus %>% select(min_DM, max_DM) %>% hdbscan(minPts = 5) ## ----add_clus_to_df----------------------------------------------------------- StackRMiletus$HDBScan_Cluster <- as.factor(hdbcluster$cluster) StackRMiletus$membership_prob <- hdbcluster$membership_prob ## ----hdbscan_viz_1, fig.width=6, fig.height = 6, echo=FALSE------------------- ggplot(StackRMiletus, aes(x = max_DM, y = min_DM, color = HDBScan_Cluster, alpha = membership_prob)) + geom_point(aes(shape = StackRMiletus$shape), position = position_jitter(width = 0.03, height = 0.03), size = 3) + scale_shape_discrete(name = "Profile") + scale_alpha_continuous(name = "Membership\nProbability") + scale_color_discrete(breaks = c("0", "1", "2", "3"), labels = c("noise", "A", "B", "C"), name = "Cluster\n(HDBScan)") + xlab(label = "maximum / outer diameter in cm") + ylab(label = "minimum / inner diameter in cm") + scale_y_continuous(breaks = seq(0, 20, 0.5)) + scale_x_continuous(breaks = seq(0, 20, 0.5)) + coord_fixed(ratio = 1, xlim = NULL, ylim = NULL, expand = TRUE) + theme(panel.background = element_blank(), panel.grid.major = element_line(colour = "grey", linewidth = 0.2, linetype = 2), axis.text.x = element_text(angle = 45)) ## ----record_summmary_statistics, echo = TRUE---------------------------------- datastructure <- data.frame(matrix(ncol = 5, nrow = 4)) colnames(datastructure) <- c("height", "min_DM", "max_DM", "width", "thickness") rownames(datastructure) <- c("range", "mean", "sd", "cv") for (col in colnames(datastructure)) { range <- range(StackRMiletus[, col]) sd <- sd(StackRMiletus[, col]) mean <- mean(StackRMiletus[, col]) datastructure["range", col] <- range[2] - range[1] datastructure["mean", col] <- mean datastructure["sd", col] <- sd datastructure["cv", col] <- sd / mean * 100 } ## ----table_summmary_statistics, echo = FALSE---------------------------------- datastructure %>% kable("html", digits = 3) %>% kable_styling(font_size = 9, full_width = FALSE, position = "left", bootstrap_options = c("striped", "hover", "condensed", "responsive")) ## ----setup_for_comparison, echo = TRUE---------------------------------------- tests_compare <- data.frame(matrix(ncol = 5, nrow = 1000)) colnames(tests_compare) <- c("N", "n_cluster", "avg_sil_width", "avg_sil_width_without_0", "n_noise") ## ----df_for_comparison, echo = TRUE------------------------------------------- library(cluster) tests_dummy <- lapply(seq_len(3000), function(x) { testsample <- data.frame(matrix(nrow = 67, ncol = 2)) colnames(testsample) <- c("min_DM", "max_DM") testsample[, "min_DM"] <- rnorm( n = 67, mean = datastructure$min_DM[2], sd = datastructure$min_DM[3] ) width <- rnorm( n = 67, mean = datastructure$width[2], sd = datastructure$width[3] ) testsample[, "max_DM"] <- (width * 2) + testsample[, "min_DM"] test_dist <- dist(testsample, method = "euclidean") test_hdbcluster <- hdbscan(testsample, minPts = 5) sil_hdbscan <- silhouette(test_hdbcluster$cluster, test_dist, do.clus.stat = T, do.n.k = T) x <- list( N = nrow(testsample), n_cluster = length(unique(test_hdbcluster$cluster)) - 1, n_noise = length(which(test_hdbcluster$cluster == 0)), avg_sil_width = NA, avg_sil_width_without_0 = NA ) if (x$n_cluster >= 1) { mat <- as.data.frame(sil_hdbscan[1:67, 1:3]) x$avg_sil_width <- mean(mat$sil_width) mat <- mat[-which(mat$cluster == 0), ] x$avg_sil_width_without_0 <- mean(mat$sil_width) } if (any(is.na(x))) { return(NA) } else { return(x) } }) tests_dummy <- tests_dummy[!is.na(tests_dummy)] tests_compare <- tests_dummy[sample(seq_along(tests_dummy), replace = FALSE, size = nrow(tests_compare))] tests_compare <- bind_rows(tests_compare) ## ----silhouette_original_data, echo = TRUE------------------------------------ dist <- StackRMiletus %>% select(min_DM, max_DM) %>% dist(method = "euclidean") sil_hdbscan <- silhouette(hdbcluster$cluster, dist, do.clus.stat = T, do.n.k = T) ## ----echo = FALSE------------------------------------------------------------- plot(sil_hdbscan, border = NA) abline(v = c(0.26, 0.51, 0.71), col = c("red", "blue", "green")) ## ----------------------------------------------------------------------------- miletus_data_sil <- data.frame(matrix(ncol = 5, nrow = 1)) colnames(miletus_data_sil) <- colnames(tests_compare) miletus_data_sil$N[1] <- nrow(StackRMiletus) miletus_data_sil$n_cluster[1] <- length(unique(hdbcluster$cluster)) - 1 miletus_data_sil$n_noise[1] <- length(which(hdbcluster$cluster == 0)) mat <- as.data.frame(sil_hdbscan[1:67, 1:3]) miletus_data_sil$avg_sil_width[1] <- mean(mat$sil_width) mat <- mat[-which(mat$cluster == 0), ] miletus_data_sil$avg_sil_width_without_0[1] <- mean(mat$sil_width) ## ----density_plot_compare_testsamples, echo = FALSE, warning = FALSE, message=FALSE---- p1 <- ggplot(data = tests_compare) + geom_density(aes(x = n_cluster, y = after_stat(density)), fill = 'grey') + labs(title = "Number of clusters") + theme(panel.background = element_blank(), panel.grid = element_blank()) + geom_vline(xintercept = miletus_data_sil$n_cluster, col = "red") p2 <- ggplot(data = tests_compare) + geom_density(aes(x = n_noise, y = after_stat(density)), fill = 'grey') + labs(title = "Number of points in noise-cluster") + theme(panel.background = element_blank(), panel.grid = element_blank()) + geom_vline(xintercept = miletus_data_sil$n_noise, col = "red") p3 <- ggplot(data = tests_compare) + geom_density(aes(x = avg_sil_width, y = after_stat(density)), fill = 'grey') + labs(title = "Average Silhouette width including noise-cluster") + theme(panel.background = element_blank(), panel.grid = element_blank()) + xlim(0, 1) + geom_vline(xintercept = miletus_data_sil$avg_sil_width, col = "red") p4 <- ggplot(data = tests_compare) + geom_density(aes(x = avg_sil_width_without_0, y = after_stat(density)), fill = 'grey') + labs(title = "Average Silhouette width without noise-cluster") + theme(panel.background = element_blank(), panel.grid = element_blank()) + xlim(0, 1) + geom_vline(xintercept = miletus_data_sil$avg_sil_width_without_0, col = "red") library(gridExtra) grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2, widths = c(2, 2)) ## ----echo = FALSE------------------------------------------------------------- StackRMiletus %>% transmute(HDBScan_Cluster, height = scale(height), min_DM = scale(min_DM), max_DM = scale(max_DM), width = scale(width), thickness = scale(thickness)) %>% filter(HDBScan_Cluster != 0) %>% melt(id.vars = "HDBScan_Cluster") %>% ggplot() + geom_boxplot(aes(y = value, x = HDBScan_Cluster, fill = HDBScan_Cluster)) + facet_wrap( ~ variable, ncol = 5) + theme(legend.position = "none", axis.title = element_blank()) ## ----echo = FALSE------------------------------------------------------------- StackRMiletus %>% select(HDBScan_Cluster, height, min_DM, max_DM, width, thickness) %>% filter(HDBScan_Cluster != 0) %>% melt(id.vars = "HDBScan_Cluster") %>% ggplot() + geom_boxplot(aes(y = value, x = HDBScan_Cluster, fill = HDBScan_Cluster)) + facet_wrap( ~ variable, ncol = 5) + theme(legend.position = "none", axis.title = element_blank())