## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
knitr::opts_chunk$set(warning = FALSE, echo = TRUE, fig.width = 7)
custom_palette <- c("#8dae25", "#17365c")
custom_palette <- colorRampPalette(
  custom_palette, 
  bias = 1, 
  interpolate = "linear"
)

library(knitr)
library(kableExtra)

## ----setup--------------------------------------------------------------------
library(clayringsmiletus)
data("StackRMiletus")

## ----display_table_1, echo = FALSE, warning=FALSE, message=FALSE--------------
library(dplyr)
StackRMiletus %>%
  filter(!is.na(Inv)) %>% 
  arrange(Inv) %>%
  kable("html") %>%
  kable_styling(
    font_size = 9, 
    full_width = FALSE, 
    position = "left", 
    bootstrap_options = c("striped", "hover", "condensed", "responsive"))

## ----density_prep, echo = TRUE, message= FALSE--------------------------------
library(reshape2)
library(ggridges)
library(ggplot2)

StackRMiletus %>%
  transmute(
    thickness_mm = thickness * 10,
    height_mm = height * 10, 
    width_cm = width, 
    min_DM_cm = min_DM, 
    max_DM_cm = max_DM) %>%
  melt() %>%
  ggplot(aes(x = value, y = variable, fill = variable)) + 
  geom_density_ridges(scale = 4, bw = "SJ", alpha = 0.8) +
  scale_fill_manual(values = custom_palette(5)) +
  theme(legend.position = "none", 
        panel.background = element_blank(), 
        axis.title = element_blank()) +
  labs(title = "Numerical variables across all rings")

## ----hdbscan------------------------------------------------------------------
library(dbscan)
hdbcluster <- StackRMiletus %>%
  select(min_DM, max_DM) %>%
  hdbscan(minPts = 5)

## ----add_clus_to_df-----------------------------------------------------------
StackRMiletus$HDBScan_Cluster <- as.factor(hdbcluster$cluster)
StackRMiletus$membership_prob <- hdbcluster$membership_prob

## ----hdbscan_viz_1, fig.width=6, fig.height = 6, echo=FALSE-------------------
ggplot(StackRMiletus, aes(x = max_DM, y = min_DM, 
                          color = HDBScan_Cluster, 
                          alpha = membership_prob)) +
  geom_point(aes(shape = StackRMiletus$shape), 
             position = position_jitter(width = 0.03, height = 0.03), 
             size = 3) +
  scale_shape_discrete(name = "Profile") +
  scale_alpha_continuous(name = "Membership\nProbability") +
  scale_color_discrete(breaks = c("0", "1", "2", "3"), 
                       labels = c("noise", "A", "B", "C"),
                       name = "Cluster\n(HDBScan)") +
  xlab(label = "maximum / outer diameter in cm") +
  ylab(label = "minimum / inner diameter in cm") +
  scale_y_continuous(breaks = seq(0, 20, 0.5)) +
  scale_x_continuous(breaks = seq(0, 20, 0.5)) +
  coord_fixed(ratio = 1, xlim = NULL, ylim = NULL, expand = TRUE) +
  theme(panel.background = element_blank(), 
        panel.grid.major = element_line(colour = "grey", 
                                        linewidth = 0.2, 
                                        linetype = 2),
        axis.text.x = element_text(angle = 45))

## ----record_summmary_statistics, echo = TRUE----------------------------------
datastructure <- data.frame(matrix(ncol = 5, nrow = 4))
colnames(datastructure) <- c("height", "min_DM", "max_DM", 
                             "width", "thickness") 
rownames(datastructure) <- c("range", "mean", "sd", "cv")

for (col in colnames(datastructure)) {
  range <- range(StackRMiletus[, col])
  sd <- sd(StackRMiletus[, col])
  mean <- mean(StackRMiletus[, col])
  datastructure["range", col] <- range[2] - range[1]
  datastructure["mean", col] <- mean
  datastructure["sd", col] <- sd
  datastructure["cv", col] <- sd / mean * 100
}

## ----table_summmary_statistics, echo = FALSE----------------------------------
datastructure %>%
  kable("html", digits = 3) %>%
  kable_styling(font_size = 9, full_width = FALSE, position = "left", 
                bootstrap_options = c("striped", "hover", 
                                      "condensed", "responsive"))

## ----setup_for_comparison, echo = TRUE----------------------------------------
tests_compare <- data.frame(matrix(ncol = 5, nrow = 1000))
colnames(tests_compare) <- c("N", "n_cluster", "avg_sil_width", 
                             "avg_sil_width_without_0", "n_noise")

## ----df_for_comparison, echo = TRUE-------------------------------------------
library(cluster)
tests_dummy <- lapply(seq_len(3000), function(x) {
  testsample <- data.frame(matrix(nrow = 67, 
                                  ncol = 2))
  colnames(testsample) <- c("min_DM", "max_DM")
  
  testsample[, "min_DM"] <- rnorm(
    n = 67, 
    mean = datastructure$min_DM[2], 
    sd = datastructure$min_DM[3]
  )
  width <- rnorm(
    n = 67, 
    mean = datastructure$width[2], 
    sd = datastructure$width[3]
  )
  
  testsample[, "max_DM"] <- (width * 2) + testsample[, "min_DM"]
  
  test_dist <- dist(testsample, method = "euclidean")
  test_hdbcluster <- hdbscan(testsample, minPts = 5)
  sil_hdbscan <- silhouette(test_hdbcluster$cluster, 
                            test_dist, 
                            do.clus.stat = T, 
                            do.n.k = T)
  
  x <- list(
    N = nrow(testsample),
    n_cluster = length(unique(test_hdbcluster$cluster)) - 1,
    n_noise = length(which(test_hdbcluster$cluster == 0)),
    avg_sil_width = NA,
    avg_sil_width_without_0 = NA
  )
  
  if (x$n_cluster >= 1) {
    mat <- as.data.frame(sil_hdbscan[1:67, 1:3])
    x$avg_sil_width <- mean(mat$sil_width)
    
    mat <- mat[-which(mat$cluster == 0), ]
    x$avg_sil_width_without_0 <- mean(mat$sil_width)
  } 
  
  if (any(is.na(x))) {
    return(NA)
  } else {
    return(x)
  }
})  

tests_dummy <- tests_dummy[!is.na(tests_dummy)]

tests_compare <- tests_dummy[sample(seq_along(tests_dummy), 
                                    replace = FALSE, 
                                    size = nrow(tests_compare))]
tests_compare <- bind_rows(tests_compare)

## ----silhouette_original_data, echo = TRUE------------------------------------
dist <- StackRMiletus %>%
  select(min_DM, max_DM) %>%
  dist(method = "euclidean")
sil_hdbscan <- silhouette(hdbcluster$cluster, 
                          dist, 
                          do.clus.stat = T, 
                          do.n.k = T)

## ----echo = FALSE-------------------------------------------------------------
plot(sil_hdbscan, border = NA)
abline(v = c(0.26, 0.51, 0.71), col = c("red", "blue", "green"))

## -----------------------------------------------------------------------------
miletus_data_sil <- data.frame(matrix(ncol = 5, nrow = 1))
colnames(miletus_data_sil) <- colnames(tests_compare)
miletus_data_sil$N[1] <- nrow(StackRMiletus)
miletus_data_sil$n_cluster[1] <- length(unique(hdbcluster$cluster)) - 1
miletus_data_sil$n_noise[1] <- length(which(hdbcluster$cluster == 0))
mat <- as.data.frame(sil_hdbscan[1:67, 1:3])
miletus_data_sil$avg_sil_width[1] <- mean(mat$sil_width)
mat <- mat[-which(mat$cluster == 0), ]
miletus_data_sil$avg_sil_width_without_0[1] <- mean(mat$sil_width)

## ----density_plot_compare_testsamples, echo = FALSE, warning = FALSE, message=FALSE----
p1 <- ggplot(data = tests_compare) +
  geom_density(aes(x = n_cluster, 
                   y = after_stat(density)), 
               fill = 'grey') +
  labs(title = "Number of clusters") +
  theme(panel.background = element_blank(),
        panel.grid = element_blank()) +
  geom_vline(xintercept = miletus_data_sil$n_cluster, 
             col = "red")

p2 <- ggplot(data = tests_compare) +
  geom_density(aes(x = n_noise, 
                   y = after_stat(density)), 
               fill = 'grey') +
  labs(title = "Number of points in noise-cluster") +
  theme(panel.background = element_blank(),
        panel.grid = element_blank()) +
  geom_vline(xintercept = miletus_data_sil$n_noise, 
             col = "red")

p3 <- ggplot(data = tests_compare) +
  geom_density(aes(x = avg_sil_width, 
                   y = after_stat(density)), 
               fill = 'grey') +
  labs(title = "Average Silhouette width including noise-cluster") +
  theme(panel.background = element_blank(),
        panel.grid = element_blank()) + xlim(0, 1) +
  geom_vline(xintercept = miletus_data_sil$avg_sil_width, 
             col = "red")

p4 <- ggplot(data = tests_compare) +
  geom_density(aes(x = avg_sil_width_without_0, 
                   y = after_stat(density)), 
               fill = 'grey') +
  labs(title = "Average Silhouette width without noise-cluster") +
  theme(panel.background = element_blank(),
        panel.grid = element_blank()) + xlim(0, 1) +
  geom_vline(xintercept = miletus_data_sil$avg_sil_width_without_0, 
             col = "red")

library(gridExtra)
grid.arrange(p1, p2, p3, p4, 
             ncol = 2, nrow = 2, widths = c(2, 2))

## ----echo = FALSE-------------------------------------------------------------
StackRMiletus %>% 
  transmute(HDBScan_Cluster, 
            height = scale(height), 
            min_DM = scale(min_DM), 
            max_DM = scale(max_DM), 
            width = scale(width), 
            thickness = scale(thickness)) %>%
  filter(HDBScan_Cluster != 0) %>%
  melt(id.vars = "HDBScan_Cluster") %>%
  ggplot() +
  geom_boxplot(aes(y = value, x = HDBScan_Cluster, fill = HDBScan_Cluster)) +
  facet_wrap( ~ variable, ncol = 5) +
  theme(legend.position = "none", 
        axis.title = element_blank())

## ----echo = FALSE-------------------------------------------------------------
StackRMiletus %>% 
  select(HDBScan_Cluster, height, min_DM, max_DM, width, thickness) %>%
  filter(HDBScan_Cluster != 0) %>%
  melt(id.vars = "HDBScan_Cluster") %>%
  ggplot() +
  geom_boxplot(aes(y = value, x = HDBScan_Cluster, fill = HDBScan_Cluster)) +
  facet_wrap( ~ variable, ncol = 5) +
  theme(legend.position = "none", 
        axis.title = element_blank())