---
title: "Machine Learning Nomogram Exemplar"
author:
  - name: Herdiantri Sufriyana
    affiliation:
    - &ibi Institute of Biomedical Informatics, College of Medicine, National 
      Yang Ming Chiao Tung University, Taipei, Taiwan. 
    email: herdi@nycu.edu.tw
  - name: Emily Chia-Yu Su
    affiliation:
    - *ibi
date: "2024-12-24"
output: 
  html_document:
    toc: true
    toc_depth: 3
    toc_float:
      collapsed: false
      smooth_scroll: true
vignette: >
  %\VignetteIndexEntry{Machine Learning Nomogram Exemplar}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

# Programming environment

```{r Load necessary packages}
library(tidyverse)
library(caret)
library(iml)
library(rmlnomogram)
```

# Binary outcome (or class-wise multinomial outcome)

## 1 - Categorical predictors and binary outcome without probability

```{r Example 1 - Train a prediction model, eval=FALSE}
# Load dataset
data("mtcars")

# Preprocess for training a classifier
mtcars$am <- factor(mtcars$am, levels = c(0, 1), labels = c("Auto", "Manual"))

# Convert some numerical features to categorical for demonstration
mtcars$cyl <- factor(mtcars$cyl)
mtcars$vs <- factor(mtcars$vs)
mtcars$qsec <- factor(as.integer(mtcars$qsec >= 18))

# Using tidyverse to filter only factor columns
mtcars <- select(mtcars, where(is.factor))

# Create dummy variables for all factor variables, excluding the target variable
dummy_vars <- dummyVars(am ~ ., data = mtcars) |> 
  predict(newdata = mtcars)

# Combine binarized predictors with the target variable
mtcars_binarized <- as.data.frame(dummy_vars) |>
  mutate_all(as.factor) |>
  mutate(am = mtcars$am) |>
  select(-vs.0, -cyl.4, -qsec.0)

# Train a random forest model using the full dataset
set.seed(1)
model <- train(
  am ~ ., 
  data = mtcars_binarized, 
  method = "rf", 
  trControl = trainControl(method = "none", classProbs = TRUE)
)
```

```{r Example 1 - Entire dataset of combinations of the selected features, eval=FALSE}
# Extract feature names used in the trained model from caret model object
caret_features <- str_remove_all(model$finalModel$xNames, "1$")

# Modify the training dataset to include only the model features
mtcars_selected <- mtcars_binarized[, caret_features]

# Generate all possible feature combinations for nomogram
nomogram_features <- expand.grid(lapply(mtcars_selected, unique))
```

```{r Example 1 - Load subs 1 to avoid multi-threading, include=FALSE}
data(nomogram_features)
```

```{r table-1, echo=FALSE}
head(nomogram_features)
```

```{r Example 1 - Prediction outputs for the entire dataset, eval=FALSE}
nomogram_outputs <- predict(model, nomogram_features, type = "prob") |>
  select(output = levels(mtcars_binarized$am)[2])
```

```{r Example 1 - Load subs 2 to avoid multi-threading, include=FALSE}
data(nomogram_outputs)
```

```{r table-2, echo=FALSE}
head(nomogram_outputs)
```

```{r Example 1 - Create nomogram}
nomogram <- create_nomogram(nomogram_features, nomogram_outputs)
```

```{r figure-1, echo=FALSE, fig.height=3, fig.width=12}
nomogram
```

```{r Example 1 - Compute SHAP values for the entire dataset, eval=FALSE}
# Prepare data and model for SHAP value calculation using iml
X <- mtcars_binarized[, -which(names(mtcars_binarized) == "am")]

# Create a predictor object
predictor <- Predictor$new(model = model, data = X)

# Calculate SHAP values
nomogram_shaps <- list()

for(i in seq(nrow(nomogram_features))){
  shapley <-  Shapley$new(predictor, x.interest = nomogram_features[i, ])
  nomogram_shaps[[i]] <- shapley$results
}

names(nomogram_shaps) <- seq(nrow(nomogram_features))

nomogram_shaps <- reduce(imap(nomogram_shaps, ~ mutate(.x, i = .y)), rbind) |>
  filter(class == levels(mtcars_binarized$am)[2]) |>
  select(i, feature, phi) |>
  spread(feature, phi) |>
  arrange(as.numeric(i)) |>
  column_to_rownames(var = "i")
```

```{r Example 1 - Load subs 3 to avoid multi-threading, include=FALSE}
data(nomogram_shaps)
```

```{r table-3, echo=FALSE}
head(nomogram_shaps)
```

## 2 - Categorical predictors and binary outcome with probability

```{r Example 2 - Create nomogram}
nomogram2 <- create_nomogram(nomogram_features, nomogram_outputs, prob = TRUE)
```

```{r figure-2, echo=FALSE, fig.height=4.5, fig.width=10}
nomogram2
```

```{r Example 2 - Create nomogram with SHAP}
nomogram2_with_shap <-
  create_nomogram(
    nomogram_features, nomogram_outputs, nomogram_shaps
    , prob = TRUE
  )
```

```{r figure-3, echo=FALSE, fig.height=6, fig.width=12}
nomogram2_with_shap
```

## 3 - Categorical with single numerical predictors and binary outcome with probability

```{r Example 3 - Train a prediction model, eval=FALSE}
# Reload original dataset
data("mtcars")

# Round to 0 decimal to reduce possible combinations later
mtcars <- mutate(mtcars, qsec = round(qsec, 0))

# Add single numerical predictor to binarized predictors with the target variable
mtcars_mixed <- cbind(mtcars["qsec"], select(mtcars_binarized, -qsec.1))

# Train a random forest model using the full dataset
set.seed(1)
model2 <- train(
  am ~ ., 
  data = mtcars_mixed, 
  method = "rf", 
  trControl = trainControl(method = "none", classProbs = TRUE)
)
```

```{r Example 3 - Entire dataset of combinations of the selected features, eval=FALSE}
# Extract feature names used in the trained model from caret model object
caret_features2 <- str_remove_all(model2$finalModel$xNames, "1$")

# Modify the training dataset to include only the model features
mtcars_selected2 <- mtcars_mixed[, caret_features2]

# Generate all possible feature combinations for nomogram
nomogram_features2 <-
  select_if(mtcars_selected2, is.numeric) |>
  lapply(\(x) seq(min(x), max(x))) |>
  c(lapply(select_if(mtcars_selected2, is.factor), unique)) |>
  expand.grid()
```

```{r Example 3 - Load subs 1 to avoid multi-threading, include=FALSE}
data(nomogram_features2)
```

```{r table-4, echo=FALSE}
head(nomogram_features2)
```

```{r Example 3 - Prediction outputs for the entire dataset, eval=FALSE}
nomogram_outputs2 <- predict(model2, nomogram_features2, type = "prob") |>
  select(output = levels(mtcars_mixed$am)[2])
```

```{r Example 3 - Load subs 2 to avoid multi-threading, include=FALSE}
data(nomogram_outputs2)
```

```{r Example 3 - Create nomogram}
nomogram3 <- create_nomogram(nomogram_features2, nomogram_outputs2, prob = TRUE)
```

```{r figure-4, echo=FALSE, fig.height=7.5, fig.width=12}
nomogram3
```

```{r Example 3 - Compute SHAP values for the entire dataset, eval=FALSE}
# Prepare data and model for SHAP value calculation using iml
X2 <- mtcars_mixed[, -which(names(mtcars_mixed) == "am")]

# Create a predictor object
predictor2 <- Predictor$new(model = model2, data = X2)

# Calculate SHAP values
nomogram_shaps2 <- list()

for(i in seq(nrow(nomogram_features2))){
  shapley2 <-  Shapley$new(predictor2, x.interest = nomogram_features2[i, ])
  nomogram_shaps2[[i]] <- shapley2$results
}

names(nomogram_shaps2) <- seq(nrow(nomogram_features2))

nomogram_shaps2 <- reduce(imap(nomogram_shaps2, ~ mutate(.x, i = .y)), rbind) |>
  filter(class == levels(mtcars_mixed$am)[2]) |>
  select(i, feature, phi) |>
  spread(feature, phi) |>
  arrange(as.numeric(i)) |>
  column_to_rownames(var = "i")
```

```{r Example 3 - Load subs 3 to avoid multi-threading, include=FALSE}
data(nomogram_shaps2)
```

```{r Example 3 - Create nomogram with SHAP}
nomogram3_with_shap <-
  create_nomogram(
    nomogram_features2, nomogram_outputs2, nomogram_shaps2
    , prob = TRUE
  )
```

```{r figure-5, echo=FALSE, fig.height=7.5, fig.width=12}
nomogram3_with_shap
```

# Continuous outcome

## 4 - Categorical predictors and continuous outcome

```{r Example 4 - Train a prediction model, eval=FALSE}
# Load dataset
data("mtcars")

# Preprocess for training a regressor
outcome <- mtcars$wt

# Convert some numerical features to categorical for demonstration
mtcars$cyl <- factor(mtcars$cyl)
mtcars$vs <- factor(mtcars$vs)
mtcars$qsec <- factor(as.integer(mtcars$qsec >= 18))

# Using tidyverse to filter only factor columns
mtcars <- cbind(select(mtcars, where(is.factor)), select(mtcars, wt))

# Create dummy variables for all factor variables, excluding the target variable
dummy_vars2 <- dummyVars(wt ~ ., data = mtcars) |> 
  predict(newdata = mtcars)

# Combine binarized predictors with the target variable
mtcars_binarized2 <- as.data.frame(dummy_vars2) |>
  mutate_all(as.factor) |>
  mutate(wt = outcome) |>
  select(-vs.0, -cyl.4, -qsec.0)

# Train a random forest model using the full dataset
set.seed(1)
model3 <- train(
  wt ~ ., 
  data = mtcars_binarized2, 
  method = "rf", 
  trControl = trainControl(method = "none")
)
```

```{r Example 4 - Entire dataset of combinations of the selected features, eval=FALSE}
# Extract feature names used in the trained model from caret model object
caret_features3 <- str_remove_all(model3$finalModel$xNames, "1$")

# Modify the training dataset to include only the model features
mtcars_selected3 <- mtcars_binarized2[, caret_features3]

# Generate all possible feature combinations for nomogram
nomogram_features3 <- expand.grid(lapply(mtcars_selected3, unique))
```

```{r Example 4 - Load subs 1 to avoid multi-threading, include=FALSE}
data(nomogram_features3)
```

```{r Example 4 - Prediction outputs for the entire dataset, eval=FALSE}
nomogram_outputs3 <- data.frame(output = predict(model3, nomogram_features3))
```

```{r Example 4 - Load subs 2 to avoid multi-threading, include=FALSE}
data(nomogram_outputs3)
```

```{r table-5, echo=FALSE}
head(nomogram_outputs3)
```

```{r Example 4 - Create nomogram}
nomogram4 <- create_nomogram(nomogram_features3, nomogram_outputs3, est = TRUE)
```

```{r figure-6, echo=FALSE, fig.height=4.5, fig.width=10}
nomogram4
```

```{r Example 4 - Compute SHAP values for the entire dataset, eval=FALSE}
# Prepare data and model for SHAP value calculation using iml
X3 <- mtcars_binarized2[, -which(names(mtcars_binarized2) == "wt")]

# Create a predictor object
predictor3 <- Predictor$new(model = model3, data = X3)

# Calculate SHAP values
nomogram_shaps3 <- list()

for(i in seq(nrow(nomogram_features3))){
  shapley3 <-  Shapley$new(predictor3, x.interest = nomogram_features3[i, ])
  nomogram_shaps3[[i]] <- shapley3$results
}

names(nomogram_shaps3) <- seq(nrow(nomogram_features3))

nomogram_shaps3 <- reduce(imap(nomogram_shaps3, ~ mutate(.x, i = .y)), rbind) |>
  select(i, feature, phi) |>
  spread(feature, phi) |>
  arrange(as.numeric(i)) |>
  column_to_rownames(var = "i")
```

```{r Example 4 - Load subs 3 to avoid multi-threading, include=FALSE}
data(nomogram_shaps3)
```

```{r Example 4 - Create nomogram with SHAP}
nomogram4_with_shap <-
  create_nomogram(
    nomogram_features3, nomogram_outputs3, nomogram_shaps3
    , est = TRUE
  )
```

```{r figure-7, echo=FALSE, fig.height=7.5, fig.width=12}
nomogram4_with_shap
```

## 5 - Categorical with single numerical predictors and continuous outcome

```{r Example 5 - Train a prediction model, eval=FALSE}
# Reload original dataset
data("mtcars")

# Round to 0 decimal to reduce possible combinations later
mtcars <- mutate(mtcars, qsec = round(qsec, 0))

# Add single numerical predictor to binarized predictors with the target variable
mtcars_mixed2 <- cbind(mtcars["qsec"], select(mtcars_binarized2, -qsec.1))

# Train a random forest model using the full dataset
set.seed(1)
model4 <- train(
  wt ~ ., 
  data = mtcars_mixed2, 
  method = "rf", 
  trControl = trainControl(method = "none")
)
```

```{r Example 5 - Entire dataset of combinations of the selected features, eval=FALSE}
# Extract feature names used in the trained model from caret model object
caret_features4 <- str_remove_all(model4$finalModel$xNames, "1$")

# Modify the training dataset to include only the model features
mtcars_selected4 <- mtcars_mixed2[, caret_features4]

# Generate all possible feature combinations for nomogram
nomogram_features4 <-
  select_if(mtcars_selected4, is.numeric) |>
  lapply(\(x) seq(min(x), max(x))) |>
  c(lapply(select_if(mtcars_selected4, is.factor), unique)) |>
  expand.grid()
```

```{r Example 5 - Load subs 1 to avoid multi-threading, include=FALSE}
data(nomogram_features4)
```

```{r Example 5 - Prediction outputs for the entire dataset, eval=FALSE}
nomogram_outputs4 <- data.frame(output = predict(model4, nomogram_features4))
```

```{r Example 5 - Load subs 2 to avoid multi-threading, include=FALSE}
data(nomogram_outputs4)
```

```{r Example 5 - Create nomogram}
nomogram5 <- create_nomogram(nomogram_features4, nomogram_outputs4, est = TRUE)
```

```{r figure-8, echo=FALSE, fig.height=7.5, fig.width=12}
nomogram5
```

```{r Example 5 - Compute SHAP values for the entire dataset, eval=FALSE}
# Prepare data and model for SHAP value calculation using iml
X4 <- mtcars_mixed2[, -which(names(mtcars_mixed2) == "wt")]

# Create a predictor object
predictor4 <- Predictor$new(model = model4, data = X4)

# Calculate SHAP values
nomogram_shaps4 <- list()

for(i in seq(nrow(nomogram_features4))){
  shapley4 <-  Shapley$new(predictor4, x.interest = nomogram_features4[i, ])
  nomogram_shaps4[[i]] <- shapley4$results
}

names(nomogram_shaps4) <- seq(nrow(nomogram_features4))

nomogram_shaps4 <- reduce(imap(nomogram_shaps4, ~ mutate(.x, i = .y)), rbind) |>
  select(i, feature, phi) |>
  spread(feature, phi) |>
  arrange(as.numeric(i)) |>
  column_to_rownames(var = "i")
```

```{r Example 5 - Load subs 3 to avoid multi-threading, include=FALSE}
data(nomogram_shaps4)
```

```{r Example 5 - Create nomogram with SHAP}
nomogram4_with_shap <-
  create_nomogram(
    nomogram_features4, nomogram_outputs4, nomogram_shaps4
    , est = TRUE
  )
```

```{r figure-9, echo=FALSE, fig.height=7.5, fig.width=12}
nomogram4_with_shap
```

```{r session-info, echo=TRUE}
sessionInfo()
```