## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = FALSE)

## ---- echo=TRUE, message=FALSE, warning=FALSE---------------------------------
pkgs <- c("miceFast", "mice", "ggplot2", "dplyr", "data.table")
inst <- lapply(pkgs, library, character.only = TRUE)
set.seed(123456)

## ---- echo=TRUE---------------------------------------------------------------
data(air_miss)  # airquality with additional variables

## ---- echo=TRUE---------------------------------------------------------------
upset_NA(air_miss, 6)

## ---- echo=TRUE---------------------------------------------------------------
# VIF (Variance Inflation Factor) > ~10 suggests collinearity problems.
VIF(
  air_miss,
  posit_y = "Ozone",
  posit_x = c("Solar.R", "Wind", "Temp", "x_character",
              "Day", "weights", "groups")
)

## ---- echo=TRUE---------------------------------------------------------------
air_miss <- air_miss %>%
  # Impute with grouping and weights
  group_by(groups) %>%
  do(mutate(., 
    Solar_R_imp = fill_NA(
      x = .,
      model = "lm_pred",
      posit_y = "Solar.R",
      posit_x = c("Wind", "Temp", "Intercept"),
      w = .[["weights"]]
    )
  )) %>%
  ungroup() %>%
  # Impute a discrete variable
  mutate(x_character_imp = fill_NA(
    x = .,
    model = "lda",
    posit_y = "x_character",
    posit_x = c("Wind", "Temp")
  )) %>%
  # Log regression because Ozone is roughly log-normal
  mutate(Ozone_imp1 = fill_NA(
    x = .,
    model = "lm_bayes",
    posit_y = "Ozone",
    posit_x = c("Intercept"),
    logreg = TRUE
  )) %>%
  # Impute by position indices
  mutate(Ozone_imp2 = fill_NA(
    x = .,
    model = "lm_bayes",
    posit_y = 1,
    posit_x = c(4, 6),
    logreg = TRUE
  )) %>%
  # Imputations (mean of 30 draws)
  mutate(Ozone_imp3 = fill_NA_N(
    x = .,
    model = "lm_noise",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .[["weights"]],
    logreg = TRUE,
    k = 30
  )) %>%
  mutate(Ozone_imp4 = fill_NA_N(
    x = .,
    model = "lm_bayes",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .[["weights"]],
    logreg = TRUE,
    k = 30
  )) %>%
  group_by(groups) %>%
  # Single evaluation model
  do(mutate(.,
    Ozone_imp5 = fill_NA(
      x = .,
      model = "lm_pred",
      posit_y = "Ozone",
      posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
      w = .[["weights"]],
      logreg = TRUE
    )
  )) %>%
  do(mutate(.,
    Ozone_imp6 = fill_NA_N(
      x = .,
      model = "pmm",
      posit_y = "Ozone",
      posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
      w = .[["weights"]],
      logreg = TRUE,
      k = 20
    )
  )) %>%
  ungroup() %>%
  # Combine imputations
  mutate(Ozone_imp_mix = rowMeans(select(., starts_with("Ozone_imp")))) %>%
  # Protect against errors in small groups
  group_by(groups) %>%
  do(mutate(.,
    Ozone_chac_imp = tryCatch(
      fill_NA(
        x = .,
        model = "lda",
        posit_y = "Ozone_chac",
        posit_x = c("Intercept", "Month", "Day", "Temp", "x_character_imp"),
        w = .[["weights"]]
      ),
      error = function(e) .[["Ozone_chac"]]
    )
  )) %>%
  ungroup()

## ---- echo=TRUE---------------------------------------------------------------
# Compare original vs. imputed distributions
compare_imp(air_miss, origin = "Ozone", target = "Ozone_imp_mix")

# Or compare multiple imputation columns
compare_imp(air_miss, origin = "Ozone", target = c("Ozone_imp2", "Ozone_imp_mix"))

## ---- echo=TRUE---------------------------------------------------------------
data(air_miss)
setDT(air_miss)

## ---- echo=TRUE---------------------------------------------------------------
# Imputations
air_miss[, Solar_R_imp := fill_NA_N(
  x = .SD,
  model = "lm_bayes",
  posit_y = "Solar.R",
  posit_x = c("Wind","Temp","Intercept"),
  w = .SD[["weights"]],
  k = 100
), by = .(groups)] %>%
  .[, x_character_imp := fill_NA(
    x = .SD,
    model = "lda",
    posit_y = "x_character",
    posit_x = c("Wind","Temp","groups")
  )] %>%
  .[, Ozone_imp1 := fill_NA(
    x = .SD,
    model = "lm_bayes",
    posit_y = "Ozone",
    posit_x = c("Intercept"),
    logreg = TRUE
  )] %>%
  .[, Ozone_imp2 := fill_NA(
    x = .SD,
    model = "lm_bayes",
    posit_y = 1,
    posit_x = c(4,6),
    logreg = TRUE
  )] %>%
  .[, Ozone_imp3 := fill_NA_N(
    x = .SD,
    model = "lm_noise",
    posit_y = "Ozone",
    posit_x = c("Intercept","x_character_imp","Wind","Temp"),
    w = .SD[["weights"]],
    logreg = TRUE,
    k = 30
  )] %>%
  .[, Ozone_imp4 := fill_NA_N(
    x = .SD,
    model = "lm_bayes",
    posit_y = "Ozone",
    posit_x = c("Intercept","x_character_imp","Wind","Temp"),
    w = .SD[["weights"]],
    logreg = TRUE,
    k = 30
  )] %>%
  .[, Ozone_imp5 := fill_NA(
    x = .SD,
    model = "lm_pred",
    posit_y = "Ozone",
    posit_x = c("Intercept","x_character_imp","Wind","Temp"),
    w = .SD[["weights"]],
    logreg = TRUE
  ), by = .(groups)] %>%
  .[, Ozone_imp6 := fill_NA_N(
    x = .SD,
    model = "pmm",
    posit_y = "Ozone",
    posit_x = c("Intercept","x_character_imp","Wind","Temp"),
    w = .SD[["weights"]],
    logreg = TRUE,
    k = 10
  ), by = .(groups)] %>%
  # Average across a set of methods
  .[, Ozone_imp_mix := apply(.SD, 1, mean), .SDcols = Ozone_imp1:Ozone_imp6] %>%
  # tryCatch for small group issues or collinearity
  .[, Ozone_chac_imp := tryCatch(
    fill_NA(
      x = .SD,
      model = "lda",
      posit_y = "Ozone_chac",
      posit_x = c("Intercept","Month","Day","Temp","x_character_imp"),
      w = .SD[["weights"]]
    ),
    error = function(e) .SD[["Ozone_chac"]]
  ), by = .(groups)]

## ---- eval=FALSE--------------------------------------------------------------
#  cd <- new(corrData, nr_cat = 3, n_obs = 1000, means = c(0,1,2), cor_matrix = some_matrix)
#  generated_data <- cd$fill("discrete")

## ---- echo=TRUE---------------------------------------------------------------
data <- cbind(as.matrix(mice::nhanes), intercept = 1, index = 1:nrow(mice::nhanes))
model <- new(miceFast)
model$set_data(data)

# Single imputation, linear model
imp_result <- model$impute("lm_pred", posit_y = 2, posit_x = 5)
model$update_var(2, imp_result$imputations)

# LDA imputation for a categorical variable
model$update_var(3, model$impute("lda", 3, c(1,2))$imputations)

# Multiple imputation example
multi_imp <- model$impute_N("lm_bayes", 4, c(1,2,3), k = 10)
model$update_var(4, multi_imp$imputations)

# Check which variables were updated
model$which_updated()

# Always be cautious with 'update_var' 
# since it permanently alters the object and your data matrix.

## ---- echo=TRUE---------------------------------------------------------------
data <- cbind(as.matrix(airquality[,-5]), intercept = 1, index = 1:nrow(airquality))
weights <- rgamma(nrow(data), 3, 3)
groups <- as.numeric(airquality[,5]) 

model <- new(miceFast)
model$set_data(data)
model$set_w(weights)
model$set_g(groups)

# Impute with group-wise weighting
model$update_var(1, model$impute("lm_pred", 1, 6)$imputations)
model$update_var(2, model$impute_N("lm_bayes", 2, c(1,3,4,5,6), k = 10)$imputations)

# Inspect updated data, returning to original order via model$get_index()
head(cbind(model$get_data(), model$get_g(), model$get_w())[order(model$get_index()), ], 4)

## ---- echo=TRUE---------------------------------------------------------------
data <- cbind(as.matrix(airquality[, -5]), intercept = 1, index = 1:nrow(airquality))
weights <- rgamma(nrow(data), 3, 3)
groups <- as.numeric(sample(1:8, nrow(data), replace = TRUE))

model <- new(miceFast)
model$set_data(data)
model$set_w(weights)
model$set_g(groups)

# The data will be automatically sorted by 'groups' during the first imputation:
model$update_var(1, model$impute("lm_pred", 1, 6)$imputations)
model$update_var(2, model$impute_N("lm_bayes", 2, c(1,3,4,5,6), 10)$imputations)

head(cbind(model$get_data(), model$get_g(), model$get_w())[order(model$get_index()), ], 4)