## ----include = FALSE----------------------------------------------------------
# Vignette-wide knitr chunk options.
knitr::opts_chunk$set(
  cache = TRUE, # cache chunk results between builds
  collapse = TRUE, # merge source and printed output into one block
  comment = "#>", # prefix for printed output lines
  fig.align = "center",
  fig.height = 6,
  fig.width = 7
)

## -----------------------------------------------------------------------------
library(catalytic)

set.seed(1)

n <- 20 # Number of observations
p <- 5 # Number of coefficients (intercept + p - 1 slopes)

# Draw the design matrix, the true coefficients, and the noise.
# Keep these three rnorm() calls in this exact order so the seeded
# random stream reproduces the same data.
obs_x <- matrix(rnorm(n * (p - 1)), ncol = p - 1)
true_coefs <- rnorm(p)
noise <- rnorm(n)

# Linear response: intercept plus slopes times covariates, plus noise
obs_y <- true_coefs[1] + obs_x %*% true_coefs[-1] + noise

obs_data <- as.data.frame(cbind(obs_x, obs_y))
names(obs_data) <- c(paste0("X", seq_len(p - 1)), "Y")

# Separate observation data into train and test halves (10 rows each)
train_idx <- sample(n, 10)
train_data <- obs_data[train_idx, ]
test_data <- obs_data[-train_idx, ]

print(dim(train_data))

## -----------------------------------------------------------------------------
# Baseline: ordinary maximum-likelihood GLM (gaussian family) fit on the
# training split only, evaluated on the held-out split.
glm_model <- stats::glm(Y ~ ., family = gaussian, data = train_data)

predicted_y <- predict(glm_model, newdata = test_data)

# Held-out mean squared prediction error
mle_mse <- mean((predicted_y - test_data$Y)^2)
cat("MLE GLM gaussian Model - Mean Square Error (Data):", mle_mse)

# Squared distance between estimated and true coefficients
mle_coef_sse <- sum((coef(glm_model) - true_coefs)^2)
cat("\nMLE GLM gaussian Model - Sum Square Error (Coefficients):", mle_coef_sse)

## -----------------------------------------------------------------------------
# Visual check: predicted vs. observed response for the baseline GLM.
plot(
  x = test_data$Y,
  y = predicted_y,
  main = "Scatter Plot of true Y vs Predicted Y",
  xlab = "true Y",
  ylab = "Predicted Y",
  col = "blue",
  pch = 19
)

# Reference line y = x: a point on it is predicted exactly
abline(a = 0, b = 1, col = "red", lwd = 2)

## -----------------------------------------------------------------------------
cat_init <- cat_glm_initialization(
  formula = Y ~ 1,
  family = gaussian,
  data = train_data,
  syn_size = 50,
  custom_variance = NULL,
  gaussian_known_variance = FALSE,
  x_degree = NULL,
  resample_only = FALSE,
  na_replace = stats::na.omit
)

cat_init

## -----------------------------------------------------------------------------
names(cat_init)

## -----------------------------------------------------------------------------
# The number of observations (rows) in the original dataset (`obs_data`)
cat_init$obs_size

# The information detailing the process of resampling synthetic data
cat_init$syn_x_resample_inform

## -----------------------------------------------------------------------------
cat_glm_model <- cat_glm(
  formula = Y ~ ., # Same as `~ .`
  cat_init = cat_init, # Output object from `cat_glm_initialization`
  tau = 10 # Defaults to the number of predictors / 4
)

cat_glm_model

## -----------------------------------------------------------------------------
cat_glm_predicted_y <- predict(
  cat_glm_model,
  newdata = test_data
)

cat(
  "Catalytic `cat_glm` - Mean Square Error (Data):",
  mean((cat_glm_predicted_y - test_data$Y)^2)
)

cat(
  "\nCatalytic `cat_glm` - Sum Square Error (Coefficients):",
  sum((coef(cat_glm_model) - true_coefs)^2)
)

## -----------------------------------------------------------------------------
plot(test_data$Y,
  cat_glm_predicted_y,
  main = "Scatter Plot of true Y vs Predicted Y (cat_glm)",
  xlab = "true Y",
  ylab = "Predicted Y (cat_glm)",
  pch = 19,
  col = "blue"
)

# Add a 45-degree line for reference
abline(a = 0, b = 1, col = "red", lwd = 2)

## -----------------------------------------------------------------------------
names(cat_glm_model)

## -----------------------------------------------------------------------------
# The formula used for modeling
cat_glm_model$formula

# The fitted GLM model object obtained from `stats::glm` with `tau`
cat_glm_model$coefficients

## -----------------------------------------------------------------------------
cat_glm_tune_cv <- cat_glm_tune(
  formula = Y ~ ., # Same as `~ .`
  cat_init = cat_init,
  risk_estimate_method = "cross_validation", # Default auto-select based on the data size
  discrepancy_method = "mean_square_error", # Default auto-select based on family
  tau_seq = seq(0, 5, 0.5), # Default is a numeric sequence around the number of predictors / 4
  cross_validation_fold_num = 2 # Default: 5
)

cat_glm_tune_cv

## -----------------------------------------------------------------------------
plot(cat_glm_tune_cv, legend_pos = "topright", text_pos = 3)

## -----------------------------------------------------------------------------
cat_glm_tune_boots <- cat_glm_tune(
  formula = ~., # Same as `Y ~ .`
  cat_init = cat_init,
  risk_estimate_method = "parametric_bootstrap", # Default auto-select based on the data size
  discrepancy_method = "mean_square_error", # Default auto-select based on family
  tau_0 = 2, # Default: 1
  parametric_bootstrap_iteration_times = 5, # Default: 100
)

cat_glm_tune_boots

## -----------------------------------------------------------------------------
cat_glm_tune_mallowian <- cat_glm_tune(
  formula = ~., # Same as `Y ~ .`
  cat_init = cat_init,
  risk_estimate_method = "mallowian_estimate", # Default auto-select based on the data size
  discrepancy_method = "mean_square_error", # Default auto-select based on family
)

cat_glm_tune_mallowian

## -----------------------------------------------------------------------------
cat_glm_tune_auto <- cat_glm_tune(
  formula = ~., # Same as `Y ~ .`
  cat_init = cat_init
)

cat_glm_tune_auto

## ----eval=FALSE---------------------------------------------------------------
#  cat_glm_tune_predicted_y <- predict(
#    cat_glm_tune_auto,
#    newdata = test_data
#  )
#  
#  cat(
#    "Catalytic `cat_glm_tune` - Mean Square Error (Data):",
#    mean((cat_glm_tune_predicted_y - test_data$Y)^2)
#  )
#  
#  cat(
#    "\nCatalytic `cat_glm_tune` - Sum Square Error (Coefficients):",
#    sum((coef(cat_glm_tune_auto) - true_coefs)^2)
#  )

## ----eval=FALSE---------------------------------------------------------------
#  plot(test_data$Y, cat_glm_tune_predicted_y,
#    main = "Scatter Plot of true Y vs Predicted Y (cat_glm_tune)",
#    xlab = "true Y",
#    ylab = "Predicted Y (cat_glm_tune)",
#    pch = 19,
#    col = "blue"
#  )
#  
#  # Add a 45-degree line for reference
#  abline(a = 0, b = 1, col = "red", lwd = 2)

## ----eval=FALSE---------------------------------------------------------------
#  names(cat_glm_tune_auto)

## ----eval=FALSE---------------------------------------------------------------
#  # The method used for risk estimation
#  cat_glm_tune_auto$risk_estimate_method
#  
#  # Selected optimal tau value from `tau_seq` that minimizes discrepancy error
#  cat_glm_tune_auto$tau

## ----eval = FALSE-------------------------------------------------------------
#  cat_glm_bayes_model <- cat_glm_bayes(
#    formula = Y ~ ., # Same as `~ .`
#    cat_init = cat_init,
#    tau = 50, # Default: number of predictors / 4
#    chains = 1, # Default: 4
#    iter = 100, # Default: 2000
#    warmup = 50, # Default: 1000
#    algorithm = "NUTS", # Default: NUTS
#    gaussian_variance_alpha = 1, # Default: number of predictors
#    gaussian_variance_beta = 1 # Default: number of predictors times variance of observation response
#  )
#  
#  cat_glm_bayes_model

## ----eval = FALSE-------------------------------------------------------------
#  traceplot(cat_glm_bayes_model)

## ----eval = FALSE-------------------------------------------------------------
#  traceplot(cat_glm_bayes_model, inc_warmup = TRUE)

## ----eval = FALSE-------------------------------------------------------------
#  cat_glm_bayes_predicted_y <- predict(
#    cat_glm_bayes_model,
#    newdata = test_data
#  )
#  
#  cat(
#    "Catalytic cat_glm_bayes - Mean Square Error (Data):",
#    mean((cat_glm_bayes_predicted_y - test_data$Y)^2)
#  )
#  
#  cat(
#    "\nCatalytic cat_glm_bayes - Sum Square Error (Coefficients):",
#    sum((coef(cat_glm_bayes_model) - true_coefs)^2)
#  )

## ----eval = FALSE-------------------------------------------------------------
#  plot(test_data$Y, cat_glm_bayes_predicted_y,
#    main = "Scatter Plot of true Y vs Predicted Y (cat_glm_bayes)",
#    xlab = "true Y",
#    ylab = "Predicted Y (cat_glm_bayes)",
#    pch = 19,
#    col = "blue"
#  )
#  
#  # Add a 45-degree line for reference
#  abline(a = 0, b = 1, col = "red", lwd = 2)

## ----eval = FALSE-------------------------------------------------------------
#  names(cat_glm_bayes_model)

## ----eval = FALSE-------------------------------------------------------------
#  # The number of Markov chains used during MCMC sampling in `rstan`.
#  cat_glm_bayes_model$chain
#  
#  # The mean estimated coefficients from the Bayesian GLM model
#  cat_glm_bayes_model$coefficients

## ----eval = FALSE-------------------------------------------------------------
#  cat_glm_bayes_joint_model <- cat_glm_bayes_joint(
#    formula = Y ~ ., # Same as `~ .`
#    cat_init = cat_init,
#    chains = 1, # Default: 4
#    iter = 100, # Default: 2000
#    warmup = 50, # Default: 1000
#    algorithm = "NUTS", # Default: NUTS
#    tau_alpha = 2, # Default: 2
#    tau_gamma = 1 # Default: 1
#  )
#  
#  cat_glm_bayes_joint_model

## ----eval = FALSE-------------------------------------------------------------
#  traceplot(cat_glm_bayes_joint_model)

## ----eval = FALSE-------------------------------------------------------------
#  traceplot(cat_glm_bayes_joint_model, inc_warmup = TRUE)

## ----eval = FALSE-------------------------------------------------------------
#  cat_glm_bayes_joint_predicted_y <- predict(
#    cat_glm_bayes_joint_model,
#    newdata = test_data
#  )
#  
#  cat(
#    "Catalytic `cat_glm_bayes_joint` - Mean Square Error (Data):",
#    mean((cat_glm_bayes_joint_predicted_y - test_data$Y)^2)
#  )
#  
#  cat(
#    "\nCatalytic `cat_glm_bayes_joint` - Sum Square Error (Coefficients):",
#    sum((coef(cat_glm_bayes_joint_model) - true_coefs)^2)
#  )

## ----eval = FALSE-------------------------------------------------------------
#  plot(test_data$Y,
#    cat_glm_bayes_joint_predicted_y,
#    main = "Scatter Plot of true Y vs Predicted Y (cat_glm_bayes_joint)",
#    xlab = "true Y",
#    ylab = "Predicted Y (cat_glm_bayes_joint)",
#    pch = 19,
#    col = "blue"
#  )
#  
#  # Add a 45-degree line for reference
#  abline(a = 0, b = 1, col = "red", lwd = 2)

## ----eval = FALSE-------------------------------------------------------------
#  names(cat_glm_bayes_joint_model)

## ----eval = FALSE-------------------------------------------------------------
#  # The estimated tau parameter from the MCMC sampling `rstan::sampling`
#  cat_glm_bayes_joint_model$tau
#  
#  # The mean estimated coefficients from the Bayesian GLM model
#  cat_glm_bayes_joint_model$coefficients