## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
library(pminternal)
library(Hmisc)

getHdata("gusto")
gusto <- gusto[, c("sex", "age", "hyp", "htn", "hrt", "pmi", "ste", "day30")]
gusto$y <- gusto$day30; gusto$day30 <- NULL

set.seed(234)
gusto <- gusto[sample(1:nrow(gusto), size = 4000), ]

## -----------------------------------------------------------------------------
stepglm <- function(data, ...){
  m <- glm(y~., data=data, family="binomial")
  step(m, trace = 0)
}

steppred <- function(model, data, ...){
  predict(model, newdata = data, type = "response")
}

validate(data = gusto, outcome = "y",
         model_fun = stepglm, pred_fun = steppred,
         method = "cv_opt", B = 10)

## -----------------------------------------------------------------------------
#library(glmnet)

ridge_fun <- function(data, ...){
  y <- data$y
  x <- data[, c('sex', 'age', 'hyp', 'htn', 'hrt', 'pmi', 'ste')]
  x$sex <- as.numeric(x$sex == "male")
  x$pmi <- as.numeric(x$pmi == "yes")
  x <- as.matrix(x)

  cv <- glmnet::cv.glmnet(x=x, y=y, alpha=0, nfolds = 10, family="binomial")
  lambda <- cv$lambda.min

  glmnet::glmnet(x=x, y=y, alpha = 0, lambda = lambda, family="binomial")
}

ridge_predict <- function(model, data, ...){
  # note: this is identical to lasso_predict from the "pminternal" vignette
  x <- data[, c('sex', 'age', 'hyp', 'htn', 'hrt', 'pmi', 'ste')]
  x$sex <- as.numeric(x$sex == "male")
  x$pmi <- as.numeric(x$pmi == "yes")
  x <- as.matrix(x)
  plogis(glmnet::predict.glmnet(model, newx = x)[, 1])
}

validate(method = "cv_optimism", data = gusto,
         outcome = "y", model_fun = ridge_fun,
         pred_fun = ridge_predict, B = 10)

# using package::function within user-defined functions
# is especially important if you want to run
# boot_* or .632 in parallel via the cores argument
# e.g.
# validate(method = ".632", data = gusto,
#          outcome = "y", model_fun = ridge_fun,
#          pred_fun = ridge_predict, B = 100, cores = 4)

## ----eval = F-----------------------------------------------------------------
# lognet_fun <- function(data, ...){
#
#   dots <- list(...)
#   if ("alpha" %in% names(dots)){
#     alpha <- dots[["alpha"]]
#   } else{
#     alpha <- 0
#   }
#
#   y <- data$y
#   x <- data[, c('sex', 'age', 'hyp', 'htn', 'hrt', 'pmi', 'ste')]
#   x$sex <- as.numeric(x$sex == "male")
#   x$pmi <- as.numeric(x$pmi == "yes")
#   x <- as.matrix(x)
#
#   cv <- glmnet::cv.glmnet(x=x, y=y, alpha = alpha, nfolds = 10, family="binomial")
#   lambda <- cv$lambda.min
#
#   glmnet::glmnet(x=x, y=y, alpha = alpha, lambda = lambda, family="binomial")
# }
#
# validate(method = "cv_optimism", data = gusto,
#          outcome = "y", model_fun = lognet_fun,
#          pred_fun = ridge_predict, B = 10, alpha = 0.5)

## ----eval = F-----------------------------------------------------------------
# enet_fun <- function(data, ...){
#
#   dots <- list(...)
# if ("nalpha" %in% names(dots)){ # nalpha <- dots[["nalpha"]] # } else{ # nalpha <- 21 # 0 to 1 in steps of 0.05 # } # # y <- data$y # x <- data[, c('sex', 'age', 'hyp', 'htn', 'hrt', 'pmi', 'ste')] # x$sex <- as.numeric(x$sex == "male") # x$pmi <- as.numeric(x$pmi == "yes") # x <- as.matrix(x) # # # run 10 fold CV for each alpha # alphas <- seq(0, 1, length.out = nalpha) # res <- lapply(alphas, function(a){ # cv <- glmnet::cv.glmnet(x=x, y=y, alpha = a, nfolds = 10, family="binomial") # list(lambda = cv$lambda.min, bin.dev = min(cv$cvm)) # }) # # select result with min binomial deviance # j <- which.min(sapply(res, function(x) x$bin.dev)) # # produce 'final' model with alpha and lambda # glmnet::glmnet(x=x, y=y, alpha = alphas[j], lambda = res[[j]][["lambda"]], family="binomial") # } # # validate(method = "cv_optimism", data = gusto, # outcome = "y", model_fun = enet_fun, # pred_fun = ridge_predict, B = 10) # ## ----------------------------------------------------------------------------- rf_fun <- function(data, ...){ dots <- list(...) num.trees <- if ("num.trees" %in% names(dots)) dots[["num.trees"]] else 500 max.depth <- if ("max.depth" %in% names(dots)) dots[["max.depth"]] else NULL min.node.size <- if ("min.node.size" %in% names(dots)) dots[["min.node.size"]] else 1 # best to make sure y is a factor where '1' is level 2 data$y <- factor(data$y, levels = 0:1) ranger::ranger(y~., data = data, probability = TRUE, num.trees = num.trees, max.depth = max.depth, min.node.size = min.node.size) } rf_predict <- function(model, data, ...){ predict(model, data = data)$predictions[, 2] } validate(method = "cv_optimism", data = gusto, outcome = "y", model_fun = rf_fun, pred_fun = rf_predict, B = 10) # instead of unlimited tree depth... validate(method = "cv_optimism", data = gusto, outcome = "y", model_fun = rf_fun, pred_fun = rf_predict, B = 10, max.depth = 3)