## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----message=FALSE, warning=FALSE---------------------------------------------
library(ranger)
library(optRF)
SNPdata[1:5,1:5]

## ----echo=FALSE, message=FALSE------------------------------------------------
load("opt_importance_vignette_initData.Rda")

## ----eval=FALSE, message=FALSE------------------------------------------------
# set.seed(123) # Set a seed for reproducibility
# RF_model = ranger(y=SNPdata[,1], x=SNPdata[,-1], write.forest = TRUE, importance="permutation")
# D_VI = data.frame(variable = names(SNPdata)[-1], importance = RF_model$variable.importance)
# D_VI = D_VI[order(D_VI$importance, decreasing=TRUE),]

## -----------------------------------------------------------------------------
head(D_VI)

## ----eval=FALSE, message=FALSE------------------------------------------------
# set.seed(321) # Set a seed for reproducibility
# RF_model_2 = ranger(y=SNPdata[,1], x=SNPdata[,-1], write.forest = TRUE, importance="permutation")
# D_VI_2 = data.frame(variable = names(SNPdata)[-1], importance = RF_model_2$variable.importance)
# D_VI_2 = D_VI_2[order(D_VI_2$importance, decreasing=TRUE),]

## ----fig.width=6, fig.height=4.5, fig.align='center'--------------------------
M = merge(D_VI, D_VI_2, by="variable")
plot(M$importance.x, M$importance.y,
     xlab="Variable importances in the first run", ylab="Variable importances in the second run")
cor(M$importance.x, M$importance.y)

## ----eval=FALSE, message=FALSE------------------------------------------------
# num.trees_values = c(500, 2500, 5000, 10000, 15000, 20000)
# result = data.frame()
# for(i in num.trees_values){
# 
#   start.time = Sys.time()
# 
#   set.seed(123)
#   RF_model_1 = ranger(y=SNPdata[,1], x=SNPdata[,-1], num.trees=i,
#                       write.forest = TRUE, importance="permutation")
#   D_VI_1 = data.frame(variable = names(SNPdata)[-1], importance = RF_model_1$variable.importance)
# 
#   set.seed(321)
#   RF_model_2 = ranger(y=SNPdata[,1], x=SNPdata[,-1],  num.trees=i,
#                       write.forest = TRUE, importance="permutation")
#   D_VI_2 = data.frame(variable = names(SNPdata)[-1], importance = RF_model_2$variable.importance)
#   end.time = Sys.time()
# 
#   M = merge(D_VI_1, D_VI_2, by="variable")
#   result = rbind(result, data.frame(number_of_trees = i,
#                                     variable_importance_stability = cor(M$importance.x, M$importance.y),
#                                     computation_time = (end.time - start.time)/2))
# }

## -----------------------------------------------------------------------------
result

## ----fig.width=8, fig.height=4.5, fig.align='center'--------------------------
par(mfrow=c(1,2))
plot(variable_importance_stability ~ number_of_trees, data=result)
plot(computation_time ~ number_of_trees, data=result)
abline(lm(result$computation_time ~ result$number_of_trees), lwd=2, col="grey")

## ----echo=FALSE, message=FALSE------------------------------------------------
load("opt_importance_vignette_optData.Rda")

## ----eval=FALSE, message=FALSE, warning=FALSE---------------------------------
# set.seed(123)
# optRF_result = opt_importance(y=SNPdata[,1], X=SNPdata[,-1])

## ----message=FALSE------------------------------------------------------------
summary(optRF_result)

## ----eval=FALSE, message=FALSE------------------------------------------------
# set.seed(123)
# RF_model_1 = ranger(y=SNPdata[,1], x=SNPdata[,-1], num.trees=optRF_result$recommendation,
#                    write.forest = TRUE, importance="permutation")
# D_VI_1 = data.frame(variable = names(SNPdata)[-1],
#                     importance = RF_model_1$variable.importance)
# 
# set.seed(321)
# RF_model_2 = ranger(y=SNPdata[,1], x=SNPdata[,-1], num.trees=optRF_result$recommendation,
#                    write.forest = TRUE, importance="permutation")
# D_VI_2 = data.frame(variable = names(SNPdata)[-1],
#                     importance = RF_model_2$variable.importance)
# 
# M = merge(D_VI_1, D_VI_2, by="variable")

## ----message=FALSE, fig.width=6, fig.height=4.5, fig.align='center'-----------
plot(M$importance.x, M$importance.y,
     xlab="Variable importances in the first run", ylab="Variable importances in the second run")
cor(M$importance.x, M$importance.y)

## ----eval=FALSE, message = FALSE, warning=FALSE-------------------------------
# set.seed(123)
# RF_model = ranger(y=SNPdata[,1], x=SNPdata[,-1], num.trees=optRF_result$recommendation,
#                  write.forest = TRUE, importance="permutation")

## ----message = FALSE, warning=FALSE, fig.width=6, fig.height=4.5, fig.align='center'----
hist(RF_model$variable.importance, xlim=c(-10,50),
     main="Histogram of variable importances", xlab="")

## ----fig.width=6, fig.height=4.5, fig.align='center'--------------------------
hist(RF_model$variable.importance, ylim=c(0,20), xlim=c(-10,50),
     main="Histogram of variable importances", xlab="")
abline(v=5, col="red", lwd=4)
selection_size = sum(RF_model$variable.importance>5)
selection_size

## ----eval=FALSE, message = FALSE, warning=FALSE-------------------------------
# set.seed(123)
# optRF_result_2 = opt_importance(y=SNPdata[,1], X=SNPdata[,-1],
#                               recommendation = "selection",
#                               alpha = selection_size)

## ----message=FALSE------------------------------------------------------------
summary(optRF_result_2)

## ----eval=FALSE, message = FALSE, warning=FALSE-------------------------------
# set.seed(123)
# RF_model_1 = ranger(y=SNPdata[,1], x=SNPdata[,-1], num.trees=optRF_result_2$recommendation,
#                    write.forest = TRUE, importance="permutation")
# D_VI_1 = data.frame(variable = names(SNPdata)[-1],
#                     importance = RF_model_1$variable.importance)
# D_VI_1 = D_VI_1[order(D_VI_1$importance, decreasing=TRUE),]
# D_VI_1$variable_ranking = c(1:nrow(D_VI_1))
# selected_variables_1 = D_VI_1[1:selection_size,1]
# 
# set.seed(321)
# RF_model_2 = ranger(y=SNPdata[,1], x=SNPdata[,-1], num.trees=optRF_result_2$recommendation,
#                    write.forest = TRUE, importance="permutation")
# D_VI_2 = data.frame(variable = names(SNPdata)[-1],
#                     importance = RF_model_2$variable.importance)
# D_VI_2 = D_VI_2[order(D_VI_2$importance, decreasing=TRUE),]
# D_VI_2$variable_ranking = c(1:nrow(D_VI_2))
# selected_variables_2 = D_VI_2[1:selection_size,1]

## -----------------------------------------------------------------------------
sum(selected_variables_1 %in% selected_variables_2)

## ----fig.width=6, fig.height=4.5, fig.align='center'--------------------------
plot_stability(optRF_result_2, measure="selection", 
               0, 100000, add_recommendation = FALSE, ylim=c(0, 1))
abline(h=1, col="red")

## ----message=FALSE------------------------------------------------------------
estimate_numtrees(optRF_result_2, measure="selection", for_stability = 0.9)
estimate_numtrees(optRF_result_2, measure="selection", for_stability = 0.95)
estimate_numtrees(optRF_result_2, measure="selection", for_stability = 0.99)
estimate_numtrees(optRF_result_2, measure="selection", for_stability = 0.999)

## ----message=FALSE------------------------------------------------------------
estimate_stability(optRF_result_2, with_num.trees=500000)

## ----echo=FALSE, message=FALSE------------------------------------------------
load("opt_importance_vignette_stabilityData.Rda")

## ----eval=FALSE, message=FALSE------------------------------------------------
# set.seed(123)
# stability_imp_500 = measure_stability(y = Training[,1], X=Training[,-1], X_Test=Test, num.trees=500, method="importance")

## ----message=FALSE------------------------------------------------------------
stability_imp_500

## ----eval=FALSE, message=FALSE------------------------------------------------
# set.seed(123)
# stability_imp_10000 = measure_stability(y = Training[,1], X=Training[,-1], X_Test=Test, num.trees=10000, method="importance")

## ----message=FALSE------------------------------------------------------------
stability_imp_10000