## ----setup, include = FALSE------------------------------------------------------
library(knitr)
# Default chunk options: collapse source and output into one block, prefix
# printed output with "#>", and use 6 x 4 figures.
opts_chunk$set(
  comment = "#>",
  collapse = TRUE,
  fig.width = 6,
  fig.height = 4
)
library(ggplot2)
# Palette of nine hex colours used for manual colour scales.
inbo_colours <- c(
  "#959B38", "#729BB7", "#E87837",
  "#BDDDD7", "#E4E517", "#843860",
  "#C04384", "#C2C444", "#685457"
)
#' INBO-flavoured ggplot2 theme
#'
#' Builds a complete theme (`complete = TRUE`) with a white plot and legend
#' background, a light grey (`"#F3F3F3"`) panel with white grid lines, no axis
#' lines or panel border, and grey-blue (`"#8E9DA7"`) facet strips.
#'
#' @param base_size Base font size. Half of it is used as the unit for
#'   margins and panel spacing.
#' @param base_family Font family passed to the root `text` element.
#' @return The theme object created by `ggplot2::theme()`.
theme_inbo <- function(base_size = 12, base_family = "") {
  # Colours shared by several elements.
  paper <- "white"
  panel_fill <- "#F3F3F3"

  # Spacing units derived from the base font size.
  half <- base_size / 2
  gap <- 0.8 * half

  # Sub-elements that occur more than once in the theme.
  blank <- ggplot2::element_blank()
  smaller <- ggplot2::rel(0.8)
  margin_all <- ggplot2::margin(t = half, r = half, b = half, l = half)
  margin_below <- ggplot2::margin(t = 0, r = 0, b = half, l = 0)
  legend_rect <- ggplot2::element_rect(colour = NA, fill = paper)
  legend_gap <- ggplot2::unit(0.2, "cm")
  strip_pad <- ggplot2::unit(0.1, "cm")

  ggplot2::theme(
    # Root elements.
    line = ggplot2::element_line(
      colour = "black", size = 0.5, linetype = 1, lineend = "butt"
    ),
    rect = ggplot2::element_rect(
      fill = paper, colour = "black", size = 0.5, linetype = 1
    ),
    text = ggplot2::element_text(
      family = base_family, face = "plain", colour = "#843860",
      size = base_size, hjust = 0.5, vjust = 0.5, angle = 0,
      lineheight = 0.9, margin = ggplot2::margin(), debug = FALSE
    ),

    # Axes.
    axis.line = blank,
    axis.line.x = blank,
    axis.line.y = blank,
    axis.text = ggplot2::element_text(size = smaller),
    axis.text.x = ggplot2::element_text(
      vjust = 1, margin = ggplot2::margin(t = gap / 2)
    ),
    axis.text.x.top = NULL,
    axis.text.y = ggplot2::element_text(
      hjust = 1, margin = ggplot2::margin(r = gap / 2)
    ),
    axis.text.y.right = NULL,
    axis.ticks = ggplot2::element_line(),
    axis.ticks.length = ggplot2::unit(0.15, "cm"),
    axis.title = ggplot2::element_text(colour = "black"),
    axis.title.x = ggplot2::element_text(
      margin = ggplot2::margin(t = gap, b = gap / 2)
    ),
    axis.title.x.top = NULL,
    axis.title.y = ggplot2::element_text(
      angle = 90, margin = ggplot2::margin(r = gap, l = gap / 2)
    ),
    axis.title.y.right = NULL,

    # Legend.
    legend.background = legend_rect,
    legend.key = ggplot2::element_rect(fill = panel_fill, colour = NA),
    legend.key.size = ggplot2::unit(1.2, "lines"),
    legend.key.height = NULL,
    legend.key.width = NULL,
    legend.margin = NULL,
    legend.spacing = legend_gap,
    legend.spacing.x = NULL,
    legend.spacing.y = NULL,
    legend.text = ggplot2::element_text(size = smaller),
    legend.text.align = NULL,
    legend.title = ggplot2::element_text(
      size = smaller, face = "bold", hjust = 0, colour = "black"
    ),
    legend.title.align = NULL,
    legend.position = "right",
    legend.direction = NULL,
    legend.justification = "center",
    legend.box = NULL,
    legend.box.margin = margin_all,
    legend.box.background = legend_rect,
    legend.box.spacing = legend_gap,

    # Panel.
    panel.background = ggplot2::element_rect(fill = panel_fill, colour = NA),
    panel.border = blank,
    panel.grid = ggplot2::element_line(colour = paper),
    panel.grid.minor = ggplot2::element_line(colour = paper, size = 0.25),
    panel.spacing = ggplot2::unit(half, "pt"),
    panel.spacing.x = NULL,
    panel.spacing.y = NULL,
    panel.ontop = FALSE,

    # Facet strips.
    strip.background = ggplot2::element_rect(fill = "#8E9DA7", colour = NA),
    strip.text = ggplot2::element_text(size = smaller, colour = "#F3F3F3"),
    strip.text.x = ggplot2::element_text(
      margin = ggplot2::margin(t = half, b = half)
    ),
    strip.text.y = ggplot2::element_text(
      angle = -90, margin = ggplot2::margin(r = half, l = half)
    ),
    strip.switch.pad.grid = strip_pad,
    strip.switch.pad.wrap = strip_pad,
    strip.placement = "outside",

    # Whole plot.
    plot.background = ggplot2::element_rect(colour = NA, fill = paper),
    plot.title = ggplot2::element_text(
      size = ggplot2::rel(1.2), margin = margin_below
    ),
    plot.subtitle = ggplot2::element_text(
      size = ggplot2::rel(1), margin = margin_below
    ),
    plot.caption = ggplot2::element_text(
      size = ggplot2::rel(0.6), margin = margin_below
    ),
    plot.margin = margin_all,
    plot.tag = ggplot2::element_text(
      size = ggplot2::rel(1.2), hjust = 0.5, vjust = 0.5
    ),
    plot.tag.position = "topleft",
    complete = TRUE
  )
}
# Make the INBO theme the session default.
theme_set(theme_inbo())
# Give these geoms the same default colour, updated in this order.
invisible(
  lapply(
    c("line", "hline", "boxplot", "smooth"),
    function(geom_name) {
      update_geom_defaults(geom_name, list(colour = "#356196"))
    }
  )
)

## ----ratio, fig.cap = "Storage space required using `split_by` relative to storing a single file.", echo = FALSE----
# Grid of every combination of a, b and r at which the ratio is evaluated.
# `length.out` is spelled out in full rather than relying on partial argument
# matching of `length`.
combinations <- expand.grid(
  a = c(0.25, 0.5, 1, 2, 4),
  b = seq(0, 1, length.out = 41),
  r = c(10, 100, 1000)
)
# Relative amount of disk space for each combination (plotted on the y axis).
combinations$ratio <- with(
  combinations,
  (a * b + b + 1) / (a + 1 + 1 / r)
)
# Format a proportion as a percentage label; shared by both axes.
percent_label <- function(x) {
  paste0(100 * x, "%")
}
# One line per value of a, one facet per value of r; the dashed horizontal
# line marks a ratio of 1 (100%).
ggplot(combinations, aes(x = b, y = ratio, colour = factor(a))) +
  geom_hline(yintercept = 1, linetype = 2) +
  geom_line() +
  facet_wrap(~ paste("r =", r)) +
  scale_x_continuous(
    expression(b~{"="}~N[s]~{"/"}~N), # nolint
    labels = percent_label
  ) +
  scale_y_continuous(
    "Relative amount of disk space",
    labels = percent_label
  ) +
  scale_colour_manual(
    paste("a = s", "r", sep = " / "),
    values = inbo_colours,
    # labels follow the order of the values of `a` in the grid above
    labels = c("1/4", "1/2", "1", "2", "4")
  )

## ----load_data, echo = FALSE-----------------------------------------------------
# Load the airbag dataset from the installed git2rdata package.
# `mustWork = TRUE` makes a missing file fail here with a clear error, rather
# than passing the empty path returned by `system.file()` on to `readRDS()`.
airbag <- readRDS(
  system.file(
    "efficiency", "airbag.rds", package = "git2rdata", mustWork = TRUE
  )
)

## ----set_tmp_dir-----------------------------------------------------------------
library(git2rdata)
# Fresh temporary directory that serves as the root for the examples.
root <- tempfile(pattern = "git2rdata-split-by")
dir.create(path = root)

## ----get_write_timings, eval = system.file("split_by", "write_timings.rds", package = "git2rdata") == ""----
# library(microbenchmark)
# mb <- microbenchmark(
#   part_1 = write_vc(airbag, "part_1", root, sorting = "X"),
#   part_2 = write_vc(airbag, "part_2", root, sorting = "X", split_by = "airbag"),
#   part_3 = write_vc(airbag, "part_3", root, sorting = "X", split_by = "abcat"),
#   part_4 = write_vc(
#     airbag, "part_4", root, sorting = "X", split_by = c("airbag", "sex")
#   ),
#   part_5 = write_vc(airbag, "part_5", root, sorting = "X", split_by = "dvcat"),
#   part_6 = write_vc(
#     airbag, "part_6", root, sorting = "X", split_by = "yearacc"
#   ),
#   part_15 = write_vc(
#     airbag, "part_15", root, sorting = "X", split_by = c("dvcat", "abcat")
#   ),
#   part_45 = write_vc(
#     airbag, "part_45", root, sorting = "X", split_by = "yearVeh"
#   ),
#   part_270 = write_vc(
#     airbag, "part_270", root, sorting = "X", split_by = c("yearacc", "yearVeh")
#   )
# )
# mb$time <- mb$time / 1e6

## ----store_write_timings, echo = FALSE-------------------------------------------
# Reuse the write timings stored in the installed package when they exist.
# Otherwise store the freshly computed `mb` in the package source tree.
if (
  nzchar(system.file("split_by", "write_timings.rds", package = "git2rdata"))
) {
  mb <- readRDS(
    system.file("split_by", "write_timings.rds", package = "git2rdata")
  )
} else {
  dir.create(file.path("..", "inst", "split_by"), showWarnings = FALSE)
  saveRDS(mb, file = file.path("..", "inst", "split_by", "write_timings.rds"))
}

## ----plot_write_timings, echo = FALSE, fig.cap = "Boxplot of the write timings for different number of parts."----
# Turn the benchmark labels ("part_<n>") into the integer number of parts.
# Indexing by the factor `mb$expr` looks up the value per level.
write_parts <- as.integer(gsub("part_", "", levels(mb$expr)))
mb$combinations <- write_parts[mb$expr]
# One box per number of parts, both axes on a log10 scale.
ggplot(mb, aes(x = combinations, y = time)) +
  geom_boxplot(mapping = aes(group = combinations)) +
  scale_x_log10(name = "Number of parts") +
  scale_y_log10(name = "Time (in milliseconds)")

## ----get_read_timings, eval = system.file("split_by", "read_timings.rds", package = "git2rdata") == ""----
# mb_r <- microbenchmark(
#   part_1 = read_vc("part_1", root),
#   part_2 = read_vc("part_2", root),
#   part_3 = read_vc("part_3", root),
#   part_4 = read_vc("part_4", root),
#   part_5 = read_vc("part_5", root),
#   part_6 = read_vc("part_6", root),
#   part_15 = read_vc("part_15", root),
#   part_45 = read_vc("part_45", root),
#   part_270 = read_vc("part_270", root)
# )
# mb_r$time <- mb_r$time / 1e6

## ----store_read_timings, echo = FALSE--------------------------------------------
# Reuse the read timings stored in the installed package when they exist.
# Otherwise store the freshly computed `mb_r` in the package source tree.
if (system.file("split_by", "read_timings.rds", package = "git2rdata") == "") {
  # Create the target directory first, so saveRDS() does not depend on
  # another step having created it already.
  dir.create(file.path("..", "inst", "split_by"), showWarnings = FALSE)
  saveRDS(mb_r, file.path("..", "inst", "split_by", "read_timings.rds"))
} else {
  mb_r <- readRDS(
    system.file("split_by", "read_timings.rds", package = "git2rdata")
  )
}

## ----plot_read_timings, echo = FALSE, fig.cap = "Boxplot of the read timings for the different number of parts."----
# Turn the benchmark labels ("part_<n>") into the integer number of parts.
# Indexing by the factor `mb_r$expr` looks up the value per level.
read_parts <- as.integer(gsub("part_", "", levels(mb_r$expr)))
mb_r$combinations <- read_parts[mb_r$expr]
# One box per number of parts, both axes on a log10 scale.
ggplot(mb_r, aes(x = combinations, y = time)) +
  geom_boxplot(mapping = aes(group = combinations)) +
  scale_x_log10(name = "Number of parts") +
  scale_y_log10(name = "Time (in milliseconds)")