## ----include = FALSE-----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----arf-------------------------------------------------------------------------
# Load libraries
library(arf)
library(data.table)
library(ggplot2)

# Set seed
set.seed(123, "L'Ecuyer-CMRG")

# Train ARF
arf_iris <- adversarial_rf(iris)

## ----arf2, fig.height=5, fig.width=5-----------------------------------------------
# Train ARF with no printouts
arf_iris <- adversarial_rf(iris, verbose = FALSE)

# Plot accuracy against iterations (model converges when accuracy <= 0.5)
tmp <- data.frame('Accuracy' = arf_iris$acc,
                  'Iteration' = seq_len(length(arf_iris$acc)))
ggplot(tmp, aes(Iteration, Accuracy)) +
  geom_point() +
  geom_path() +
  geom_hline(yintercept = 0.5, linetype = 'dashed', color = 'red')

## ----par, eval=FALSE----------------------------------------------------------------
# # Register cores - Unix
# library(doParallel)
# registerDoParallel(cores = 2)

## ----par2, eval=FALSE---------------------------------------------------------------
# # Register cores - Windows
# library(doParallel)
# cl <- makeCluster(2)
# registerDoParallel(cl)

## ----arf3------------------------------------------------------------------------
# Rerun ARF, now in parallel and with more trees
arf_iris <- adversarial_rf(iris, num_trees = 100)

## ----forde------------------------------------------------------------------------
# Compute leaf and distribution parameters
params_iris <- forde(arf_iris, iris)

## ----forde_unif--------------------------------------------------------------------
# Recompute with uniform density
params_unif <- forde(arf_iris, iris, family = 'unif')

## ----dirichlet----------------------------------------------------------------------
# Recompute with additive smoothing
params_alpha <- forde(arf_iris, iris, alpha = 0.1)

# Compare results
head(params_iris$cat)
head(params_alpha$cat)

## ----unity------------------------------------------------------------------------
# Sum probabilities
summary(params_iris$cat[, sum(prob), by = .(f_idx, variable)]$V1)
summary(params_alpha$cat[, sum(prob), by = .(f_idx, variable)]$V1)

## ----forde2------------------------------------------------------------------------
# Print the full parameter list
params_iris

## ----lik-------------------------------------------------------------------------
# Compute likelihood under truncated normal and uniform distributions
ll <- lik(params_iris, iris, arf = arf_iris)
ll_unif <- lik(params_unif, iris, arf = arf_iris)

# Compare average negative log-likelihood (lower is better)
-mean(ll)
-mean(ll_unif)

## ----iris------------------------------------------------------------------------
# Inspect the original data
head(iris)

## ----iris2, fig.height=5, fig.width=5----------------------------------------------
# Compute likelihoods after marginalizing over Species
iris_without_species <- iris[, -5]
ll_cnt <- lik(params_iris, iris_without_species)

# Compare results
tmp <- data.frame(Total = ll, Partial = ll_cnt)
ggplot(tmp, aes(Total, Partial)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, color = 'red')

## ----smiley, fig.height=5, fig.width=8----------------------------------------------
# Simulate training data
library(mlbench)
x <- mlbench.smiley(1000)
x <- data.frame(x$x, x$classes)
colnames(x) <- c('X', 'Y', 'Class')

# Fit ARF
arf_smiley <- adversarial_rf(x, mtry = 2)

# Estimate parameters
params_smiley <- forde(arf_smiley, x)

# Simulate data
synth <- forge(params_smiley, n_synth = 1000)

# Compare structure
str(x)
str(synth)

# Put it all together
x$Data <- 'Original'
synth$Data <- 'Synthetic'
df <- rbind(x, synth)

# Plot results
ggplot(df, aes(X, Y, color = Class, shape = Class)) +
  geom_point() +
  facet_wrap(~ Data)
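## ----synth-check (added sketch)------------------------------------------------
# A minimal sanity check on the synthetic smiley data, not part of the original
# vignette: compare marginal summaries of the original and generated samples.
# Uses only objects defined above (x, synth); no additional arf API is assumed.
summary(x$X)
summary(synth$X)
summary(x$Y)
summary(synth$Y)
table(x$Class)
table(synth$Class)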
## ----conditional, fig.height=5, fig.width=5------------------------------------------
# Compute conditional likelihoods
evi <- data.frame(Species = 'setosa')
ll_conditional <- lik(params_iris, query = iris_without_species, evidence = evi)

# Compare NLL across species (shifting to positive range for visualization)
tmp <- iris
tmp$NLL <- -ll_conditional + max(ll_conditional) + 1
ggplot(tmp, aes(Species, NLL, fill = Species)) +
  geom_boxplot() +
  scale_y_log10() +
  ylab('Negative Log-Likelihood') +
  theme(legend.position = 'none')

## ----cond2------------------------------------------------------------------------
# Data frame of conditioning events
evi <- data.frame(Species = "setosa", Petal.Width = ">0.3")
evi

## ----cond3------------------------------------------------------------------------
# Conditioning on an interval event
evi <- data.frame(Species = "setosa", Petal.Width = "(0.3,0.5)")
evi

## ----cond4------------------------------------------------------------------------
# Drawing random weights
evi <- data.frame(f_idx = params_iris$forest$f_idx,
                  wt = rexp(nrow(params_iris$forest)))
evi$wt <- evi$wt / sum(evi$wt)
head(evi)

## ----smiley2, fig.height=5, fig.width=8----------------------------------------------
# Simulate class-conditional data for smiley example
evi <- data.frame(Class = 4)
synth2 <- forge(params_smiley, n_synth = 250, evidence = evi)

# Put it all together
synth2$Data <- 'Synthetic'
df <- rbind(x, synth2)

# Plot results
ggplot(df, aes(X, Y, color = Class, shape = Class)) +
  geom_point() +
  facet_wrap(~ Data)

## ----cond6------------------------------------------------------------------------
# Compute conditional expectations
expct(params_smiley, evidence = evi)

## ----imp1------------------------------------------------------------------------
# Load palmer penguins dataset
library(palmerpenguins)
head(penguins)

## ----imp2------------------------------------------------------------------------
# Single imputation
penguins_imp <- impute(penguins)
head(penguins_imp)

## ----imp3, fig.height=6, fig.width=6--------------------------------------------------
# Multiple imputation
penguins_multiple_imp <- impute(penguins, m = 100)

# Check the distribution of bill_length_mm for sample 4
tmp <- sapply(penguins_multiple_imp, function(dat) dat$bill_length_mm[4])
hist(tmp, breaks = 20)
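## ----imp-pool (added sketch)-----------------------------------------------------
# A hypothetical follow-up, not part of the original vignette: pool the m = 100
# completed datasets by averaging a summary statistic across imputations (here,
# the mean bill length). Plain base R on the list returned by impute(); no
# additional arf API is assumed.
means <- sapply(penguins_multiple_imp, function(dat) mean(dat$bill_length_mm))
mean(means)  # pooled point estimate
var(means)   # between-imputation variance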