## ----setup, include = FALSE---------------------------------------------------
library(knitr)
opts_chunk$set(
  fig.height = 4, fig.width = 6,
  collapse = TRUE,
  comment = "#>"
)
library(ggplot2)
inbo_colours <- c("#959B38", "#729BB7", "#E87837", "#BDDDD7", "#E4E517",
                  "#843860", "#C04384", "#C2C444", "#685457")
theme_inbo <- function(base_size = 12, base_family = "") {
  rect_bg <- "white"
  legend_bg <- "white"
  panel_bg <- "#F3F3F3"
  panel_grid <- "white"
  plot_bg <- "white"
  half_line <- base_size / 2
  ggplot2::theme(
    line = ggplot2::element_line(colour = "black", size = 0.5, linetype = 1,
                        lineend = "butt"),
    rect = ggplot2::element_rect(fill = rect_bg, colour = "black", size = 0.5,
                        linetype = 1),
    text = ggplot2::element_text(family = base_family, face = "plain",
                        colour = "#843860", size = base_size, hjust = 0.5,
                        vjust = 0.5, angle = 0, lineheight = 0.9,
                        margin = ggplot2::margin(), debug = FALSE),
    axis.line = ggplot2::element_blank(),
    axis.line.x = ggplot2::element_blank(),
    axis.line.y = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = ggplot2::rel(0.8)),
    axis.text.x = ggplot2::element_text(
      margin = ggplot2::margin(t = 0.8 * half_line / 2), vjust = 1
    ),
    axis.text.x.top = NULL,
    axis.text.y = ggplot2::element_text(
      margin = ggplot2::margin(r = 0.8 * half_line / 2), hjust = 1
    ),
    axis.text.y.right = NULL,
    axis.ticks = ggplot2::element_line(),
    axis.ticks.length = ggplot2::unit(0.15, "cm"),
    axis.title = ggplot2::element_text(colour = "black"),
    axis.title.x = ggplot2::element_text(
      margin = ggplot2::margin(t = 0.8 * half_line, b = 0.8 * half_line / 2)
    ),
    axis.title.x.top = NULL,
    axis.title.y = ggplot2::element_text(
      margin = ggplot2::margin(r = 0.8 * half_line, l = 0.8 * half_line / 2),
      angle = 90
    ),
    axis.title.y.right = NULL,
    legend.background = ggplot2::element_rect(colour = NA, fill = legend_bg),
    legend.key = ggplot2::element_rect(fill = panel_bg, colour = NA),
    legend.key.size = ggplot2::unit(1.2, "lines"),
    legend.key.height = NULL,
    legend.key.width = NULL,
    legend.margin = NULL,
    legend.spacing = ggplot2::unit(0.2, "cm"),
    legend.spacing.x = NULL,
    legend.spacing.y = NULL,
    legend.text = ggplot2::element_text(size = ggplot2::rel(0.8)),
    legend.text.align = NULL,
    legend.title = ggplot2::element_text(
      size = ggplot2::rel(0.8), face = "bold", hjust = 0, colour = "black"
    ),
    legend.title.align = NULL,
    legend.position = "right",
    legend.direction = NULL,
    legend.justification = "center",
    legend.box = NULL,
    legend.box.margin = ggplot2::margin(
      t = half_line, r = half_line, b = half_line, l = half_line
    ),
    legend.box.background = ggplot2::element_rect(
      colour = NA, fill = legend_bg
    ),
    legend.box.spacing = ggplot2::unit(0.2, "cm"),
    panel.background = ggplot2::element_rect(fill = panel_bg, colour = NA),
    panel.border = ggplot2::element_blank(),
    panel.grid = ggplot2::element_line(colour = panel_grid),
    panel.grid.minor = ggplot2::element_line(colour = panel_grid, size = 0.25),
    panel.spacing = ggplot2::unit(half_line, "pt"),
    panel.spacing.x = NULL,
    panel.spacing.y = NULL,
    panel.ontop = FALSE,
    strip.background = ggplot2::element_rect(fill = "#8E9DA7", colour = NA),
    strip.text = ggplot2::element_text(
      size = ggplot2::rel(0.8), colour = "#F3F3F3"
    ),
    strip.text.x = ggplot2::element_text(
      margin = ggplot2::margin(t = half_line, b = half_line)
    ),
    strip.text.y = ggplot2::element_text(
      margin = ggplot2::margin(r = half_line, l = half_line), angle = -90
    ),
    strip.switch.pad.grid = ggplot2::unit(0.1, "cm"),
    strip.switch.pad.wrap = ggplot2::unit(0.1, "cm"),
    strip.placement = "outside",
    plot.background = ggplot2::element_rect(colour = NA, fill = plot_bg),
    plot.title = ggplot2::element_text(
      size = ggplot2::rel(1.2), margin = ggplot2::margin(0, 0, half_line, 0)
    ),
    plot.subtitle = ggplot2::element_text(
      size = ggplot2::rel(1), margin = ggplot2::margin(0, 0, half_line, 0)
    ),
    plot.caption = ggplot2::element_text(
      size = ggplot2::rel(0.6), margin = ggplot2::margin(0, 0, half_line, 0)
    ),
    plot.margin = ggplot2::margin(
      t = half_line, r = half_line, b = half_line, l = half_line
    ),
    plot.tag = ggplot2::element_text(
      size = ggplot2::rel(1.2), hjust = 0.5, vjust = 0.5
    ),
    plot.tag.position = "topleft",
    complete = TRUE
  )
}
theme_set(theme_inbo())
update_geom_defaults("line", list(colour = "#356196"))
update_geom_defaults("hline", list(colour = "#356196"))
update_geom_defaults("boxplot", list(colour = "#356196"))

## ----download_data, eval = system.file("efficiency", "airbag.rds", package = "git2rdata") == ""----
# airbag <- read.csv(
#   "https://vincentarelbundock.github.io/Rdatasets/csv/DAAG/nassCDS.csv"
# )
# airbag$dead <- airbag$dead == "dead"
# airbag$airbag <- airbag$airbag == "airbag"
# airbag$seatbelt <- airbag$seatbelt == "belted"
# airbag$dvcat <- as.ordered(airbag$dvcat)

## ----load_data, echo = FALSE--------------------------------------------------
if (system.file("efficiency", "airbag.rds", package = "git2rdata") == "") {
  saveRDS(airbag, file.path("..", "inst", "efficiency", "airbag.rds"))
} else {
  airbag <- readRDS(
    system.file("efficiency", "airbag.rds", package = "git2rdata")
  )
}

## ----data_structure-----------------------------------------------------------
str(airbag)

## ----set_tmp_dir--------------------------------------------------------------
library(git2rdata)
root <- tempfile("git2rdata-efficient")
dir.create(root)

## ----file_size----------------------------------------------------------------
write.table(airbag, file.path(root, "base_R.tsv"), sep = "\t")
base_size <- file.size(file.path(root, "base_R.tsv"))

saveRDS(airbag, file.path(root, "base_R.rds"))
rds_size <- file.size(file.path(root, "base_R.rds"))

fn <- write_vc(airbag, "airbag_optimize", root, sorting = "X")
optim_size <- sum(file.size(file.path(root, fn)))

fn <- write_vc(airbag, "airbag_verbose", root, sorting = "X", optimize = FALSE)
verbose_size <- sum(file.size(file.path(root, fn)))

## ----table_file_size, echo = FALSE--------------------------------------------
kable(
  data.frame(
    method = c("saveRDS()", "write_vc(), optimized", "write_vc(), verbose",
               "write.table()"),
    file_size = c(rds_size, optim_size, verbose_size, base_size) / 2 ^ 10,
    relative = c(rds_size, optim_size, verbose_size, base_size) / base_size
  ),
  caption = "Resulting file sizes (in kB) and file sizes relative to the size of
  write.table().",
  digits = 2
)

## ----factor_label_length, echo = FALSE, fig.cap = "Effect of the label length on the efficiency of storing factor optimized, assuming 1000 observations", warning = FALSE----
ratio <- function(label_length = 1:20, n_levels = 9, n_obs = 1000) {
  meta_length <- 63 + (5 + label_length + floor(log10(n_levels))) * n_levels
  optimized <- n_obs * mean(ceiling(log10(seq_len(n_levels) + 1)))
  verbose <- n_obs * label_length
  ifelse(
    62 ^ label_length >= n_levels,
    (optimized + meta_length) / (verbose + meta_length),
    NA
  )
}
lengths <- 1:50
f_ratio <- rbind(
  data.frame(
    label_length = lengths,
    levels = 1000,
    ratio = ratio(lengths, 1000, 1000)
  ),
  data.frame(
    label_length = lengths,
    levels = 100,
    ratio = ratio(lengths, 100, 1000)
  ),
  data.frame(
    label_length = lengths,
    levels = 10,
    ratio = ratio(lengths, 10, 1000)
  ),
  data.frame(
    label_length = lengths,
    levels = 3,
    ratio = ratio(lengths, 3, 1000)
  )
)
f_ratio$levels <- factor(f_ratio$levels, levels = c(1000, 100, 10, 3))
ggplot(f_ratio, aes(x = label_length, y = ratio, colour = levels)) +
  geom_hline(yintercept = 1, linetype = 2) +
  geom_line() +
  scale_x_continuous("label length (characters)") +
  scale_y_continuous(paste("optimized bytes", "verbose bytes", sep = " / "),
                     breaks = seq(0, 1.25, by = 0.25)) +
  scale_colour_manual("number of \nlevels", values = inbo_colours)

## ----factor_observations, echo = FALSE, fig.cap = "Effect of the number of observations on the efficiency of storing factor optimized assuming labels with 10 characters"----
n_obs <- 10 ^ seq(log10(1), log10(10000), length = 41)
f_ratio <- rbind(
  data.frame(
    observations = n_obs,
    levels = 3,
    ratio = sapply(n_obs, ratio, label_length = 10, n_levels = 3)
  ),
  data.frame(
    observations = n_obs,
    levels = 10,
    ratio = sapply(n_obs, ratio, label_length = 10, n_levels = 10)
  ),
  data.frame(
    observations = n_obs,
    levels = 100,
    ratio = sapply(n_obs, ratio, label_length = 10, n_levels = 100)
  ),
  data.frame(
    observations = n_obs,
    levels = 1000,
    ratio = sapply(n_obs, ratio, label_length = 10, n_levels = 1000)
  )
)
f_ratio$levels <- factor(f_ratio$levels, levels = c(1000, 100, 10, 3))
ggplot(f_ratio, aes(x = observations, y = ratio, colour = levels)) +
  geom_hline(yintercept = 1, linetype = 2) +
  geom_line() +
  scale_x_log10() +
  scale_y_continuous(paste("optimized bytes", "verbose bytes", sep = " / "),
                     breaks = seq(0, 1.25, by = 0.25)) +
  scale_colour_manual("number of \nlevels", values = inbo_colours)

## ----git_size, eval = system.file("efficiency", "git_size.rds", package = "git2rdata") == ""----
# library(git2r)
# tmp_repo <- function() {
#   root <- tempfile("git2rdata-efficient-git")
#   dir.create(root)
#   repo <- git2r::init(root)
#   git2r::config(repo, user.name = "me", user.email = "me@me.com")
#   return(repo)
# }
# commit_and_size <- function(repo, filename) {
#   add(repo, filename)
#   commit(repo, "test", session = TRUE)
#   git_size <- system(
#     sprintf("cd %s\ngit gc\ngit count-objects -v", dirname(repo$path)),
#     intern = TRUE
#   )
#   git_size <- git_size[grep("size-pack", git_size)]
#   as.integer(gsub(".*: (.*)", "\\1", git_size))
# }
# 
# repo_wt <- tmp_repo()
# repo_wts <- tmp_repo()
# repo_rds <- tmp_repo()
# repo_wvco <- tmp_repo()
# repo_wvcv <- tmp_repo()
# 
# repo_size <- replicate(
#   100, {
#     observed_subset <- rbinom(nrow(airbag), size = 1, prob = 0.9) == 1
#     this <- airbag[
#       sample(which(observed_subset)),
#       sample(ncol(airbag))
#     ]
#     this_sorted <- airbag[observed_subset, ]
#     fn_wt <- file.path(workdir(repo_wt), "base_R.tsv")
#     write.table(this, fn_wt, sep = "\t")
#     fn_wts <- file.path(workdir(repo_wts), "base_R.tsv")
#     write.table(this_sorted, fn_wts, sep = "\t")
#     fn_rds <- file.path(workdir(repo_rds), "base_R.rds")
#     saveRDS(this, fn_rds)
#     fn_wvco <- write_vc(this, "airbag_optimize", repo_wvco, sorting = "X")
#     fn_wvcv <- write_vc(
#       this, "airbag_verbose", repo_wvcv, sorting = "X", optimize = FALSE
#     )
#     c(
#       write.table = commit_and_size(repo_wt, fn_wt),
#       write.table.sorted = commit_and_size(repo_wts, fn_wts),
#       saveRDS = commit_and_size(repo_rds, fn_rds),
#       write_vc.optimized = commit_and_size(repo_wvco, fn_wvco),
#       write_vc.verbose = commit_and_size(repo_wvcv, fn_wvcv)
#     )
# })

## ----store_git_size, echo = FALSE---------------------------------------------
if (system.file("efficiency", "git_size.rds", package = "git2rdata") == "") {
  saveRDS(repo_size, file.path("..", "inst", "efficiency", "git_size.rds"))
} else {
  repo_size <- readRDS(
    system.file("efficiency", "git_size.rds", package = "git2rdata")
  )
}

## ----plot_git_size, echo = FALSE, fig.cap = "Size of the git history using the different storage methods."----
rs <- lapply(
  rownames(repo_size),
  function(x) {
    if (x == "saveRDS") {
      fun <- "saveRDS"
      optimized <- "yes"
    } else if (x == "write_vc.optimized") {
      fun <- "write_vc"
      optimized <- "yes"
    } else if (x == "write_vc.verbose") {
      fun <- "write_vc"
      optimized <- "no"
    } else if (x == "write.table") {
      fun <- "write.table"
      optimized <- "no"
    } else if (x == "write.table.sorted") {
      fun <- "write.table"
      optimized <- "yes"
    }
    data.frame(commit = seq_along(repo_size[x, ]), size = repo_size[x, ],
               rel_size = repo_size[x, ] / repo_size["write.table.sorted", ],
               fun = fun, optimized = optimized, stringsAsFactors = FALSE)
  }
)
rs <- do.call(rbind, rs)
rs$optimized <- factor(rs$optimized, levels = c("yes", "no"))
ggplot(
  rs, aes(x = commit, y = size / 2^10, colour = fun, linetype = optimized)
) +
  geom_line() +
  scale_y_continuous("repo size (in MiB)") +
  scale_colour_manual("function", values = inbo_colours)

## ----plot_rel_git_size, echo = FALSE, fig.cap = "Relative size of the git repository when compared to write.table()."----
ggplot(rs, aes(x = commit, y = rel_size, colour = fun, linetype = optimized)) +
  geom_line() +
  scale_y_continuous("size relative to sorted write.table()", breaks = 0:10) +
  scale_colour_manual("function", values = inbo_colours)

## ----get_file_timings, eval = system.file("efficiency", "file_timings.rds", package = "git2rdata") == ""----
# library(microbenchmark)
# mb <- microbenchmark(
#   write.table = write.table(airbag, file.path(root, "base_R.tsv"), sep = "\t"),
#   saveRDS = saveRDS(airbag, file.path(root, "base_R.rds")),
#   write_vc.optim = write_vc(airbag, "airbag_optimize", root, sorting = "X"),
#   write_vc.verbose = write_vc(airbag, "airbag_verbose", root, sorting = "X",
#                               optimize = FALSE)
# )
# mb$time <- mb$time / 1e6

## ----store_file_timings, echo = FALSE-----------------------------------------
if (
  system.file("efficiency", "file_timings.rds", package = "git2rdata") == ""
) {
  saveRDS(mb, file.path("..", "inst", "efficiency", "file_timings.rds"))
} else {
  mb <- readRDS(
    system.file("efficiency", "file_timings.rds", package = "git2rdata")
  )
}

## ----median_write, echo = FALSE-----------------------------------------------
median_time <- aggregate(time ~ expr, data = mb, FUN = median)
write_ratio <- 100 * median_time$time /
  median_time$time[median_time$expr == "write.table"]
names(write_ratio) <- median_time$expr

## ----plot_file_timings, echo = FALSE, fig.cap = "Boxplot of the write timings for the different methods."----
mb$expr <- reorder(mb$expr, mb$time, FUN = median)
levels(mb$expr) <- gsub("write_vc\\.", "write_vc\n", levels(mb$expr))
ggplot(mb, aes(x = expr, y = time)) +
  geom_boxplot() +
  scale_y_continuous("Time (in milliseconds)", limits = c(0, NA)) +
  theme(axis.title.x = ggplot2::element_blank())

## ----get_read_timings, eval = system.file("efficiency", "read_timings.rds", package = "git2rdata") == ""----
# mb <- microbenchmark(
#   read.table = read.table(file.path(root, "base_R.tsv"), header = TRUE,
#                           sep = "\t"),
#   readRDS = readRDS(file.path(root, "base_R.rds")),
#   read_vc.optim = read_vc("airbag_optimize", root),
#   read_vc.verbose = read_vc("airbag_verbose", root)
# )
# mb$time <- mb$time / 1e6

## ----store_read_timings, echo = FALSE-----------------------------------------
if (
  system.file("efficiency", "read_timings.rds", package = "git2rdata") == ""
) {
  saveRDS(mb, file.path("..", "inst", "efficiency", "read_timings.rds"))
} else {
  mb <- readRDS(
    system.file("efficiency", "read_timings.rds", package = "git2rdata")
  )
}

## ----median_read, echo = FALSE------------------------------------------------
median_time <- aggregate(time ~ expr, data = mb, FUN = median)
read_ratio <- 100 * median_time$time /
  median_time$time[median_time$expr == "read.table"]
names(read_ratio) <- median_time$expr

## ----plot_read_timings, echo = FALSE, fig.cap = "Boxplots for the read timings for the different methods."----
mb$expr <- factor(
  mb$expr,
  levels = c("readRDS", "read.table", "read_vc.optim", "read_vc.verbose")
)
levels(mb$expr) <- gsub("read_vc\\.", "read_vc\n", levels(mb$expr))
ggplot(mb, aes(x = expr, y = time)) +
  geom_boxplot() +
  scale_y_continuous("Time (in milliseconds)", limits = c(0, NA)) +
  theme(axis.title.x = ggplot2::element_blank())