## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
library(pcpr)
suppressPackageStartupMessages({
  library(dplyr) # for manipulation of queens
  library(GGally) # for the ggcorr fn
  library(ggplot2) # for plotting
  library(lubridate) # for manipulating dates
  library(magrittr) # for the pipe %>%
  library(tidyr) # for pivoting queens for better plotting
})

plot_matrix <- function(D, ..., lp = "none", title = NULL) {
  D <- t(D)
  if (is.null(colnames(D))) colnames(D) <- paste0("C", 1:ncol(D))
  data.frame(D) %>%
    dplyr::mutate(x = paste0("R", 1:nrow(.))) %>%
    tidyr::pivot_longer(
      tidyselect::all_of(colnames(D)),
      names_to = "y",
      values_to = "value"
    ) %>%
    dplyr::mutate(
      x = factor(x, levels = unique(x)),
      y = factor(y, levels = unique(y))
    ) %>%
    ggplot(aes(x = x, y = y, fill = value)) +
    geom_raster() +
    scale_y_discrete(limits = rev) +
    coord_equal() +
    scale_fill_viridis_c(na.value = "white", ...) +
    theme_minimal() +
    theme(
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank(),
      axis.title.x = element_blank(),
      axis.title.y = element_blank(),
      legend.position = lp,
      plot.margin = margin(0, 0, 0, 0),
      aspect.ratio = 1
    ) +
    ggtitle(title)
}

## ----queens raw---------------------------------------------------------------
queens

## ----plot queens--------------------------------------------------------------
plot_matrix(queens[, 2:ncol(queens)])

## ----trends-------------------------------------------------------------------
queens %>%
  pivot_longer(
    colnames(queens)[-1],
    names_to = "chem", values_to = "concentration"
  ) %>%
  filter(!is.na(concentration)) %>%
  ggplot(aes(x = Date, y = concentration)) +
  geom_line() +
  geom_smooth(color = "red", formula = "y ~ x", method = "loess", span = 0.05) +
  facet_wrap(~chem, scales = "free_y") +
  labs(x = "Date", y = "Concentration (Âµg/m^3)") +
  theme_bw()

## ----dates--------------------------------------------------------------------
start_date <- "2015-01-01"
queens_small <- queens %>% filter(Date >= as.Date(start_date))
queens_small

## ----raw correlation----------------------------------------------------------
ggcorr(
  queens_small[, 2:ncol(queens_small)],
  method = "pairwise.complete.obs",
  limits = FALSE, label = FALSE, size = 5
)

## ----distributions------------------------------------------------------------
queens_small %>%
  pivot_longer(
    colnames(queens_small)[-1],
    names_to = "chem", values_to = "concentration"
  ) %>%
  filter(!is.na(concentration)) %>%
  ggplot(aes(x = concentration)) +
  geom_histogram(bins = 50) +
  theme_bw() +
  facet_wrap(~chem, scales = "free")

## ----preprocessing------------------------------------------------------------
queens_scaled <- queens_small %>% select(-Date)
queens_scaled[queens_scaled < 0] <- 0

queens_scaled <- data.frame(scale(queens_scaled, center = FALSE))

queens_scaled$Date <- queens_small$Date

## ----new dists----------------------------------------------------------------
queens_scaled %>%
  pivot_longer(
    colnames(queens_small)[-1],
    names_to = "chem", values_to = "concentration"
  ) %>%
  filter(!is.na(concentration)) %>%
  ggplot(aes(x = concentration)) +
  geom_histogram(bins = 50) +
  theme_bw() +
  facet_wrap(~chem, scales = "free")

## ----sing---------------------------------------------------------------------
D <- as.matrix(queens_scaled %>% select(-Date))
D_imputed <- impute_matrix(D, apply(D, 2, mean, na.rm = TRUE))
singular_values <- sing(D_imputed)
plot(singular_values, type = "b")

## ----gs, eval=FALSE, echo=TRUE------------------------------------------------
# eta_0 <- get_pcp_defaults(D)$eta
# # to get progress bar, could wrap this
# # in a call to progressr::with_progress({ gs <- grid_search_cv(...) })
# gs <- grid_search_cv(
#   D,
#   pcp_fn = rrmc,
#   grid = data.frame(eta = 10^seq(-1, 2, 1) * round(eta_0, 3)),
#   r = 8,
#   parallel_strategy = "multisession",
#   num_workers = 16,
#   verbose = FALSE
# )
# r_star <- gs$summary_stats$r[1]
# eta_star <- round(gs$summary_stats$eta[1], 3)
# gs$summary_stats

## ----gs results, eval=TRUE, echo=FALSE----------------------------------------
gs <- readRDS("rds_files/applied-gs.rds")
r_star <- gs$summary_stats$r[1]
eta_star <- round(gs$summary_stats$eta[1], 3)
gs$summary_stats

## ----pcp----------------------------------------------------------------------
pcp_model <- rrmc(D, r = r_star, eta = eta_star)
L <- pcp_model$L
S <- pcp_model$S

## ----obj----------------------------------------------------------------------
plot(pcp_model$objective, type = "l")

## ----output L-----------------------------------------------------------------
plot_matrix(L)
L_rank <- matrix_rank(L)
L_rank

## ----orig corr----------------------------------------------------------------
ggcorr(
  queens_small[, 2:ncol(queens_small)],
  method = "pairwise.complete.obs",
  limits = FALSE, label = FALSE, size = 5
)

## ----new corr-----------------------------------------------------------------
L_df <- data.frame(L)
colnames(L_df) <- colnames(queens[, 2:ncol(queens)])
ggcorr(
  L_df,
  method = "pairwise.complete.obs",
  limits = FALSE, label = FALSE, size = 5
)

## ----sparse-------------------------------------------------------------------
sparsity(S)
hist(S)
plot_matrix(S)

## ----sparse by col------------------------------------------------------------
S_df <- data.frame(S)
colnames(S_df) <- colnames(queens_small[, 2:ncol(queens_small)])
chems_w_most_extreme_events <- sort(apply(S_df, 2, function(x) sparsity(as.matrix(x))))
chems_w_most_extreme_events

## ----K sparse events----------------------------------------------------------
S_df %>%
  select(K) %>%
  mutate(Date = queens_small$Date) %>%
  filter(K > 0)

## ----NMF, eval=FALSE, echo=TRUE-----------------------------------------------
# library(NMF)
# 
# nmf_mat <- L
# nmf_mat[nmf_mat < 0] <- 0
# 
# res <- nmf(
#   nmf_mat,
#   rank = L_rank,
#   method = "offset", nrun = 30,
#   seed = 0, .opt = "vp16"
# )
# 
# raw_scores <- basis(res)
# raw_loadings <- coef(res)

## ----load NMF, eval=TRUE, echo=FALSE------------------------------------------
raw_scores <- readRDS("rds_files/applied-nmf-raw-scores-W.rds")
raw_loadings <- readRDS("rds_files/applied-nmf-raw-loadings-H.rds")

## ----loadings-----------------------------------------------------------------
loadings <- data.frame(raw_loadings)
colnames(loadings) <- colnames(L_df)
loadings[["Pattern"]] <- paste("Pattern", 1:L_rank)

loadings %>%
  pivot_longer(-Pattern, names_to = "Chemical", values_to = "Loading") %>%
  ggplot(aes(x = Chemical, y = Loading)) +
  geom_point(size = 2) +
  geom_segment(aes(yend = 0, xend = Chemical), linewidth = 1) +
  facet_wrap(~Pattern, scales = "free_x") +
  theme_bw() +
  theme(
    strip.text.x = element_text(size = 12),
    title = element_text(size = 16),
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    strip.background = element_rect(fill = "white"),
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  ) +
  geom_hline(yintercept = 0, linewidth = 0.2) +
  ggtitle("PCP-NMF Loadings")

## ----var exp------------------------------------------------------------------
nmf_scores <- data.frame(raw_scores)
colnames(nmf_scores) <- c(
  "Traffic", "Crustal Dust", "Salt", "Regional/Secondary & Tailpipe Emissions"
)
nmf_sources <- nmf_scores %>%
  mutate(Date = queens_scaled$Date) %>%
  pivot_longer(-Date, names_to = "Pattern", values_to = "Score")

var_explained <- nmf_sources %>%
  group_by(Pattern) %>%
  summarize(patsum = sum(Score)) %>%
  mutate(VarianceExplained = round(100 * patsum / sum(patsum), 1)) %>%
  select(-patsum) %>%
  arrange(desc(VarianceExplained)) %>%
  mutate(PatName = paste(Pattern, paste0("(", VarianceExplained, "% Var Exp)")))

nmf_sources <- nmf_sources %>%
  mutate(
    Pattern = factor(
      Pattern,
      levels = as.character(var_explained$Pattern),
      labels = as.character(var_explained$PatName)
    )
  )

var_explained %>% select(-PatName)

## ----rename patterns----------------------------------------------------------
loadings %>%
  mutate(Pattern = factor(
    colnames(nmf_scores),
    levels = as.character(var_explained$Pattern),
    labels = as.character(var_explained$PatName)
  )) %>%
  pivot_longer(-Pattern, names_to = "Chemical", values_to = "Loading") %>%
  ggplot(aes(x = Chemical, y = Loading, color = Pattern)) +
  scale_color_brewer(palette = "Set1") +
  geom_point(size = 2) +
  geom_segment(aes(yend = 0, xend = Chemical), linewidth = 1) +
  facet_wrap(~Pattern, scales = "free_x") +
  theme_bw() +
  theme(
    strip.text.x = element_text(size = 12),
    title = element_text(size = 16),
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    strip.background = element_rect(fill = "white"),
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  ) +
  geom_hline(yintercept = 0, linewidth = 0.2) +
  ggtitle("PCP-NMF Loadings")

## ----NMF scores---------------------------------------------------------------
yr_num <- length(unique(year(nmf_sources$Date)))

ggplot(nmf_sources, aes(Date, Score, color = Pattern)) +
  scale_color_brewer(palette = "Set1") +
  geom_smooth(method = "loess", span = 0.05) +
  facet_wrap(~Pattern, scales = "free", ncol = 1) +
  xlab("") +
  ylab("Pattern Score") +
  theme_classic() +
  theme(
    legend.position = "none",
    title = element_text(size = 18),
    axis.title.y = element_text(size = 18),
    strip.text = element_text(size = 14),
    axis.text.x = element_text(size = 13),
    axis.text.y = element_text(size = 11)
  ) +
  scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
  ggtitle("PCP-NMF Scores over Time")