## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----echo = TRUE, eval = FALSE------------------------------------------------
#  install.packages("sgboost")

## ----eval = FALSE-------------------------------------------------------------
#  remotes::install_github(
#    "FabianObster/sgboost",
#    build_vignettes = TRUE, dependencies = TRUE
#  )

## ----warning=FALSE, message=FALSE---------------------------------------------
library(mboost)
library(sgboost)
library(dplyr)
library(ggplot2)

## ----warning=FALSE,message=FALSE, eval=TRUE-----------------------------------
set.seed(10)
n <- 100
p <- 200
X <- matrix(data = rnorm(n * p, mean = 0, sd = 1), nrow = n, ncol = p)
beta_star <- c(
  rep(5, 5), c(5, -5, 2, 0, 0), rep(-5, 5),
  c(2, -3, 8, 0, 0), rep(0, (p - 20))
)
groups <- rep(1:(p / 5), each = 5)

# Linear regression model
eps <- rnorm(n, mean = 0, sd = 1)
y <- X %*% beta_star + eps

# Logistic regression model
pr <- 1 / (1 + exp(-X %*% beta_star))
y_binary <- as.factor(rbinom(n, 1, pr))

# Input data.frames
df <- X %>%
  as.data.frame() %>%
  mutate_all(function(x) {
    as.numeric(scale(x))
  }) %>%
  mutate(y = as.numeric(y), y_binary = y_binary)
group_df <- data.frame(
  group_name = groups,
  variable_name = head(colnames(df), -2)
)

## ----eval=TRUE----------------------------------------------------------------
sgb_formula_linear <- create_formula(
  alpha = 0.4, group_df = group_df, outcome_name = "y", intercept = FALSE,
  group_name = "group_name", var_name = "variable_name",
)
sgb_formula_binary <- create_formula(
  alpha = 0.4, group_df = group_df, outcome_name = "y_binary", intercept = FALSE,
  group_name = "group_name", var_name = "variable_name",
)

## ----eval=TRUE----------------------------------------------------------------
sgb_model_linear <- mboost(
  formula = sgb_formula_linear, data = df,
  control = boost_control(nu = 1, mstop = 600)
)
# cv_sgb_model_linear <- cvrisk(sgb_model_linear,
#                               folds = cv(model.weights(sgb_model_linear),
#                                          type = 'kfold', B = 10))
sgb_model_binary <- mboost(
  formula = sgb_formula_binary, data = df, family = Binomial(),
  control = boost_control(nu = 1, mstop = 600)
)
# cv_sgb_model_binary <- cvrisk(sgb_model_binary,
#                               folds = cv(model.weights(sgb_model_linear),
#                                          type = 'kfold', B = 10))
# mstop(cv_sgb_model_linear)
# mstop(cv_sgb_model_binary)
# plot(cv_sgb_model_linear)
# plot(cv_sgb_model_binary)
sgb_model_linear <- sgb_model_linear[320]
sgb_model_binary <- sgb_model_binary[540]

## ----eval=TRUE, fig.align='center', fig.height=5, fig.width=7-----------------
get_varimp(sgb_model = sgb_model_linear)$varimp %>% slice(1:10)
get_varimp(sgb_model = sgb_model_linear)$group_importance
plot_varimp(sgb_model = sgb_model_linear, n_predictors = 15)

## ----eval=TRUE, fig.align='center', fig.height=5, fig.width=7-----------------
get_coef(sgb_model = sgb_model_linear)$aggregate %>% slice(1:10)
get_coef(sgb_model = sgb_model_linear)$raw %>% slice(1:10)

## ----eval=TRUE, fig.align='center', fig.height=5, fig.width=7-----------------
plot_effects(sgb_model = sgb_model_binary, n_predictors = 10, base_size = 10)
plot_effects(sgb_model = sgb_model_binary, n_predictors = 10, plot_type = "clock", base_size = 10)
plot_effects(sgb_model = sgb_model_binary, n_predictors = 10, plot_type = "scatter", base_size = 10)

## ----eval=TRUE, fig.align='center', fig.height=5, fig.width=7-----------------
plot_path(sgb_model = sgb_model_linear[100])
plot_path(sgb_model = sgb_model_binary[100])