## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.align = "center",
  fig.width = 7,
  fig.height = 6,
  cache = TRUE
)

## ----eval=FALSE---------------------------------------------------------------
#  library(catalytic)
#  
#  set.seed(1)
#  
#  # Function for calculating the mean of logarithmic error between true response and estimated response
#  get_mean_logarithmic_error <- function(Y, est_Y) {
#    Y <- pmax(0.0001, pmin(0.9999, Y))
#    est_Y <- pmax(0.0001, pmin(0.9999, est_Y))
#    return(mean(Y * log(Y / est_Y) + (1 - Y) * log((1 - Y) / (1 - est_Y))))
#  }
#  
#  # Load swim dataset for binomial analysis
#  data("swim")
#  swim_data <- cbind(swim$x, swim$y)
#  
#  # Seperate observation data into train and test data
#  n <- 5 * ncol(swim$x + 1) # Size for training data
#  train_idx <- sample(1:nrow(swim_data), n)
#  train_data <- swim_data[train_idx, ]
#  test_data <- swim_data[-train_idx, ]
#  
#  dim(train_data)

## ----eval=FALSE---------------------------------------------------------------
#  # Fit a logistic regression model (GLM)
#  glm_model <- stats::glm(
#    formula = empyr1 ~ .,
#    family = binomial,
#    data = train_data
#  )
#  
#  predicted_y <- predict(
#    glm_model,
#    newdata = test_data,
#    type = "response"
#  )
#  
#  cat(
#    "MLE GLM Binomial Model - Logarithmic Error:",
#    get_mean_logarithmic_error(
#      Y = test_data$empyr1,
#      est_Y = predicted_y
#    )
#  )

## ----eval=FALSE---------------------------------------------------------------
#  roc_curve <- pROC::roc(test_data$empyr1, predicted_y)
#  plot(roc_curve, main = "ROC Curve (MLE)", col = "blue", lwd = 2)

## ----eval=FALSE---------------------------------------------------------------
#  cat_init <- cat_glm_initialization(
#    formula = empyr1 ~ 1,
#    family = binomial, # Default: gaussian
#    data = train_data,
#    syn_size = 50, # Default: 4 times the number of predictors
#    x_degree = NULL, # Default: NULL, which means degrees of all predictors are 1
#    resample_only = FALSE, # Default: FALSE
#    na_replace = mean # Default: stats::na.omit
#  )
#  
#  cat_init

## ----eval=FALSE---------------------------------------------------------------
#  names(cat_init)

## ----eval=FALSE---------------------------------------------------------------
#  # The number of observations (rows) in the original dataset (`obs_data`)
#  cat_init$obs_size
#  
#  # The information detailing the process of resampling synthetic data
#  cat_init$syn_x_resample_inform

## ----eval=FALSE---------------------------------------------------------------
#  cat_glm_model <- cat_glm(
#    formula = empyr1 ~ female + agege35, # Same as `~ female + agege35`
#    cat_init = cat_init,
#    tau = 10 # Default: number of predictors / 4
#  )
#  
#  cat_glm_model

## ----eval=FALSE---------------------------------------------------------------
#  cat_glm_predicted_y <- predict(
#    cat_glm_model,
#    newdata = test_data,
#    type = "response"
#  )
#  
#  cat(
#    "Catalytic `cat_glm` - Logarithmic Error:",
#    get_mean_logarithmic_error(
#      Y = test_data$empyr1,
#      est_Y = cat_glm_predicted_y
#    )
#  )

## ----eval=FALSE---------------------------------------------------------------
#  roc_curve <- pROC::roc(test_data$empyr1, cat_glm_predicted_y)
#  plot(roc_curve, main = "ROC Curve (cat_glm)", col = "blue", lwd = 2)

## ----eval=FALSE---------------------------------------------------------------
#  names(cat_glm_model)

## ----eval=FALSE---------------------------------------------------------------
#  # The formula used for modeling
#  cat_glm_model$formula
#  
#  # The fitted GLM model object obtained from `stats::glm` with `tau`
#  cat_glm_model$coefficients

## ----eval=FALSE---------------------------------------------------------------
#  cat_glm_tune_cv <- cat_glm_tune(
#    formula = empyr1 ~ ., # Same as `~ .`
#    cat_init = cat_init,
#    risk_estimate_method = "cross_validation", # Default auto-select based on the data size
#    discrepancy_method = "logistic_deviance", # Default auto-select based on family
#    tau_seq = seq(0.1, 5.1, 0.5), # Default is a numeric sequence around the number of predictors / 4. Do not recommand to including 0 for cross validation.
#    cross_validation_fold_num = 3 # Default: 5
#  )
#  
#  cat_glm_tune_cv

## ----eval=FALSE---------------------------------------------------------------
#  plot(cat_glm_tune_cv)

## ----eval=FALSE---------------------------------------------------------------
#  cat_glm_tune_boots <- cat_glm_tune(
#    formula = ~., # Same as `empyr1 ~ .`
#    cat_init = cat_init,
#    risk_estimate_method = "parametric_bootstrap", # Default auto-select based on the data size
#    discrepancy_method = "logistic_deviance", # Default auto-select based on family
#    tau_0 = 2, # Default: number of predictors / 4
#    parametric_bootstrap_iteration_times = 10, # Default: 100
#  )
#  
#  cat_glm_tune_boots

## ----eval=FALSE---------------------------------------------------------------
#  cat_glm_tune_stein <- cat_glm_tune(
#    formula = ~., # Same as `empyr1 ~ .`
#    cat_init = cat_init,
#    risk_estimate_method = "steinian_estimate", # Default auto-select based on the data size
#    discrepancy_method = "logistic_deviance", # default  auto select based on family
#  )
#  
#  cat_glm_tune_stein

## ----eval=FALSE---------------------------------------------------------------
#  cat_glm_tune_auto <- cat_glm_tune(
#    formula = ~.,
#    cat_init = cat_init
#  )
#  
#  cat_glm_tune_auto

## ----eval=FALSE---------------------------------------------------------------
#  cat_glm_tune_predicted_y <- predict(
#    cat_glm_tune_auto,
#    newdata = test_data,
#    type = "response"
#  )
#  
#  cat(
#    "Catalytic `cat_glm_tune` - Logarithmic Error:",
#    get_mean_logarithmic_error(
#      Y = test_data$empyr1,
#      est_Y = cat_glm_tune_predicted_y
#    )
#  )

## ----eval=FALSE---------------------------------------------------------------
#  roc_curve <- pROC::roc(test_data$empyr1, cat_glm_tune_predicted_y)
#  plot(roc_curve, main = "ROC Curve (cat_glm_tune)", col = "blue", lwd = 2)

## ----eval=FALSE---------------------------------------------------------------
#  names(cat_glm_tune_auto)

## ----eval=FALSE---------------------------------------------------------------
#  # The method used for risk estimation
#  cat_glm_tune_auto$risk_estimate_method
#  
#  # Selected optimal tau value from `tau_seq` that minimizes discrepancy error
#  cat_glm_tune_auto$tau

## ----eval = FALSE-------------------------------------------------------------
#  cat_glm_bayes_model <- cat_glm_bayes(
#    formula = empyr1 ~ ., # Same as `~ .`
#    cat_init = cat_init,
#    tau = 50, # Default: number of predictors / 4
#    chains = 1, # Default: 4
#    iter = 100, # Default: 2000
#    warmup = 50, # Default: 1000
#    algorithm = "NUTS" # Default: NUTS
#  )
#  
#  cat_glm_bayes_model

## ----eval = FALSE-------------------------------------------------------------
#  traceplot(cat_glm_bayes_model)

## ----eval = FALSE-------------------------------------------------------------
#  traceplot(cat_glm_bayes_model, inc_warmup = TRUE)

## ----eval = FALSE-------------------------------------------------------------
#  cat_glm_bayes_predicted_y <- predict(
#    cat_glm_bayes_model,
#    newdata = test_data,
#    type = "response"
#  )
#  
#  cat(
#    "MLE GLM Binomial Model - Logarithmic Error:",
#    get_mean_logarithmic_error(
#      Y = test_data$empyr1,
#      est_Y = cat_glm_bayes_predicted_y
#    )
#  )

## ----eval = FALSE-------------------------------------------------------------
#  roc_curve <- pROC::roc(test_data$empyr1, cat_glm_bayes_predicted_y)
#  plot(roc_curve, main = "ROC Curve (cat_glm_bayes)", col = "blue", lwd = 2)

## ----eval = FALSE-------------------------------------------------------------
#  names(cat_glm_bayes_model)

## ----eval = FALSE-------------------------------------------------------------
#  # The number of Markov chains used during MCMC sampling in `rstan`.
#  cat_glm_bayes_model$chain
#  
#  # The mean estimated coefficients from the Bayesian GLM model
#  cat_glm_bayes_model$coefficients

## ----eval = FALSE-------------------------------------------------------------
#  cat_glm_bayes_joint_model <- cat_glm_bayes_joint(
#    formula = empyr1 ~ ., # Same as `~ .`
#    cat_init = cat_init,
#    chains = 1, # Default: 4
#    iter = 100, # Default: 2000
#    warmup = 50, # Default: 1000
#    algorithm = "NUTS", # Default: NUTS
#    tau_alpha = 2, # Default: 2
#    tau_gamma = 1 # Default: 1
#  )
#  
#  cat_glm_bayes_joint_model

## ----eval = FALSE-------------------------------------------------------------
#  traceplot(cat_glm_bayes_joint_model)

## ----eval = FALSE-------------------------------------------------------------
#  traceplot(cat_glm_bayes_joint_model, inc_warmup = TRUE)

## ----eval = FALSE-------------------------------------------------------------
#  cat_glm_bayes_joint_predicted_y <- predict(
#    cat_glm_bayes_joint_model,
#    newdata = test_data,
#    type = "response"
#  )
#  
#  cat(
#    "Catlytic `cat_glm_bayes_joint` - Logarithmic Error:",
#    get_mean_logarithmic_error(
#      Y = test_data$empyr1,
#      est_Y = cat_glm_bayes_joint_predicted_y
#    )
#  )

## ----eval = FALSE-------------------------------------------------------------
#  roc_curve <- pROC::roc(test_data$empyr1, cat_glm_bayes_joint_predicted_y)
#  plot(roc_curve, main = "ROC Curve (cat_glm_bayes_joint)", col = "blue", lwd = 2)

## ----eval = FALSE-------------------------------------------------------------
#  names(cat_glm_bayes_joint_model)

## ----eval = FALSE-------------------------------------------------------------
#  # The estimated tau parameter from the MCMC sampling `rstan::sampling`,
#  cat_glm_bayes_joint_model$tau
#  
#  # The mean estimated coefficients from the Bayesian GLM model
#  cat_glm_bayes_joint_model$coefficients

## ----eval = FALSE-------------------------------------------------------------
#  cat_glm_bayes_joint_gibbs_model <- cat_glm_bayes_joint_gibbs(
#    formula = empyr1 ~ ., # Same as `~ .`
#    cat_init = cat_init,
#    iter = 100, # Default: 1000
#    warmup = 50, # Default: 500
#    coefs_iter = 2, # Default: 5
#    tau_0 = 1, # Default: number of predictors / 4
#    tau_alpha = 2, # Default: 2
#    tau_gamma = 1, # Default: 1
#    refresh = TRUE # Default: TRUE
#  )
#  
#  cat_glm_bayes_joint_gibbs_model

## ----eval = FALSE-------------------------------------------------------------
#  traceplot(cat_glm_bayes_joint_gibbs_model)

## ----eval = FALSE-------------------------------------------------------------
#  traceplot(cat_glm_bayes_joint_gibbs_model, pars = c("female", "hsdip", "numchild"))

## ----eval = FALSE-------------------------------------------------------------
#  cat_glm_bayes_joint_gibbs_predicted_y <- predict(
#    cat_glm_bayes_joint_gibbs_model,
#    newdata = test_data,
#    type = "response"
#  )
#  
#  cat(
#    "Catalytic `cat_glm_bayes_joint_gibbs` - Logarithmic Error:",
#    get_mean_logarithmic_error(
#      Y = test_data$empyr1,
#      est_Y = cat_glm_bayes_joint_gibbs_predicted_y
#    )
#  )

## ----eval = FALSE-------------------------------------------------------------
#  roc_curve <- pROC::roc(test_data$empyr1, cat_glm_bayes_joint_gibbs_predicted_y)
#  plot(roc_curve, main = "ROC Curve (cat_glm_bayes_joint_gibbs)", col = "blue", lwd = 2)

## ----eval = FALSE-------------------------------------------------------------
#  names(cat_glm_bayes_joint_gibbs_model)

## ----eval = FALSE-------------------------------------------------------------
#  # The Date and time when sampling ended.
#  cat_glm_bayes_joint_gibbs_model$sys_time
#  
#  # The summary statistics
#  cat_glm_bayes_joint_gibbs_model$inform_df

## ----eval = FALSE-------------------------------------------------------------
#  cat_glm_bayes_joint_tau_lower <- cat_glm_bayes_joint(
#    formula = ~., # Same as `empyr1 ~ .`
#    cat_init = cat_init,
#    binomial_tau_lower = 0.5, # Default: 0.05
#    chains = 1, # Default: 4
#    iter = 100, # Default: 2000
#    warmup = 50 # Default: 1000
#  )
#  
#  cat_glm_bayes_joint_tau_lower

## ----eval = FALSE-------------------------------------------------------------
#  cat_glm_bayes_joint_tau_lower_predicted_y <- predict(
#    cat_glm_bayes_joint_tau_lower,
#    newdata = test_data,
#    type = "response"
#  )
#  
#  cat(
#    "Catalytic `cat_glm_bayes_joint_tau_lower` - Logarithmic Error:",
#    get_mean_logarithmic_error(
#      Y = test_data$empyr1,
#      est_Y = cat_glm_bayes_joint_tau_lower_predicted_y
#    )
#  )

## ----eval = FALSE-------------------------------------------------------------
#  roc_curve <- pROC::roc(test_data$empyr1, cat_glm_bayes_joint_tau_lower_predicted_y)
#  plot(roc_curve, main = "ROC Curve (MLE)", col = "blue", lwd = 2)

## ----eval = FALSE-------------------------------------------------------------
#  cat_glm_bayes_joint_theta <- cat_glm_bayes_joint(
#    formula = ~., # Same as `empyr1 ~ .`
#    cat_init = cat_init,
#    binomial_joint_theta = TRUE, # Default: FALSE
#    chains = 1, # Default: 4
#    iter = 100, # Default: 2000
#    warmup = 50 # Default: 1000
#  )
#  
#  cat_glm_bayes_joint_theta

## ----eval = FALSE-------------------------------------------------------------
#  cat_glm_bayes_joint_theta_predicted_y <- predict(
#    cat_glm_bayes_joint_theta,
#    newdata = test_data,
#    type = "response"
#  )
#  
#  cat(
#    "Catalytic `cat_glm_bayes_joint_theta` - Logarithmic Error:",
#    get_mean_logarithmic_error(
#      Y = test_data$empyr1,
#      est_Y = cat_glm_bayes_joint_theta_predicted_y
#    )
#  )

## ----eval = FALSE-------------------------------------------------------------
#  roc_curve <- pROC::roc(test_data$empyr1, cat_glm_bayes_joint_theta_predicted_y)
#  plot(roc_curve, main = "ROC Curve (cat_glm_bayes_joint_theta)", col = "blue", lwd = 2)

## ----eval = FALSE-------------------------------------------------------------
#  cat_glm_bayes_joint_alpha <- cat_glm_bayes_joint(
#    formula = ~., # Same as `empyr1 ~ .`
#    cat_init = cat_init,
#    binomial_joint_theta = TRUE, # Default: FALSE
#    binomial_joint_alpha = TRUE, # Default: FALSE
#    chains = 1, # Default: 4
#    iter = 100, # Default: 2000
#    warmup = 50 # Default: 1000
#  )
#  
#  cat_glm_bayes_joint_alpha

## ----eval = FALSE-------------------------------------------------------------
#  cat_glm_bayes_joint_alpha_predicted_y <- predict(
#    cat_glm_bayes_joint_alpha,
#    newdata = test_data,
#    type = "response"
#  )
#  
#  cat(
#    "Catalytic `cat_glm_bayes_joint_alpha` - Logarithmic Error:",
#    get_mean_logarithmic_error(
#      Y = test_data$empyr1,
#      est_Y = cat_glm_bayes_joint_alpha_predicted_y
#    )
#  )

## ----eval = FALSE-------------------------------------------------------------
#  roc_curve <- pROC::roc(test_data$empyr1, cat_glm_bayes_joint_alpha_predicted_y)
#  plot(roc_curve, main = "ROC Curve (cat_glm_bayes_joint_alpha)", col = "blue", lwd = 2)