## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(dplyr)
library(kableExtra)
library(ROCit)

## ----load package and data, echo=FALSE----------------------------------------
library(Rprofet)
load(file = "lending_club.rda")  #loads the lending_club data frame
data = lending_club
rm(lending_club)

## ----data-preview, echo=FALSE--------------------------------------------------
#dim(data)
#head(data[1:10])
kableExtra::kbl(head(data[, 1:10]),
                caption = "Head of first 10 columns of lending club data",
                booktabs = T, linesep = "") %>%
  kable_styling(latex_options = c("striped", "scale_down"))

#splitting data
set.seed(112233)
n <- nrow(data)
split <- sample(c(rep(0, 0.8 * n), rep(1, 0.2 * n)))
train <- data[split == 0, ]
validation <- data[split == 1, ]

#summary table: dimensions and bad-to-good count ratio for each dataset
tb = data.frame(Dataset = c('Original', 'Training', 'Validation'),
                Rows = 0, Columns = 0, "TargetRate" = 0)
tb[1, 2:3] = dim(data)
tb[2, 2:3] = dim(train)
tb[3, 2:3] = dim(validation)
tb[1, 4] = round(table(data$bad)[2]/table(data$bad)[1], 3)
tb[2, 4] = round(table(train$bad)[2]/table(train$bad)[1], 3)
tb[3, 4] = round(table(validation$bad)[2]/table(validation$bad)[1], 3)

## ----data-set-summary, echo=FALSE----------------------------------------------
#datasets summary
kableExtra::kbl(tb, caption = "Dimensions of the datasets",
                booktabs = T, linesep = "") %>%
  kable_styling(latex_options = "striped")

## ----var-bf-bin, fig.cap = "Original Distribution of Months since recent inquiry", fig.height=3, fig.width=4.5, echo=FALSE, warning=FALSE----
ggplot2::ggplot(data = train, ggplot2::aes(x = mths_since_recent_inq)) +
  ggplot2::geom_histogram(binwidth = 1, color = "white", fill = "#0066CC") +
  ggplot2::theme(legend.position = "none",
                 axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_x_continuous(breaks = seq(0, 25, 5))

## ----coarse-binning------------------------------------------------------------
binData = BinProfet(data = train, id = "ID", target = "bad",
                    num.bins = 10, min.pts.bin = 200)

## ----head-of-binned-data, echo=FALSE--------------------------------------------
kableExtra::kbl(head(binData[1:10]),
                caption = "Head of first 10 columns of the binned data",
                booktabs = T, linesep = "") %>%
  kable_styling(latex_options = c("striped", "scale_down", "hold_position"))

## ----var-af-bin, fig.cap = "Distribution of binned Months since recent inquiry", fig.height=3, fig.width=4.5, echo=FALSE, warning=FALSE----
ggplot2::ggplot(data = binData, ggplot2::aes(x = mths_since_recent_inq_Bins)) +
  ggplot2::geom_bar(color = "white", fill = "#0066CC", width = 0.7) +
  ggplot2::theme(legend.position = "none",
                 axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

## ----binned-var-ex, echo=FALSE, include=FALSE-----------------------------------
kableExtra::kbl(table(binData$mths_since_recent_inq_Bins),
                col.names = c("Variable Bins", "Frequency"),
                caption = "Frequency of each bin for months since recent inquiry",
                booktabs = T, linesep = "") %>%
  kable_styling(latex_options = "striped")
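## ----bin-size-sanity-check-sketch----------------------------------------------
#Optional sketch (not part of the original vignette): a quick check that no
#coarse bin produced by BinProfet() ends up far below the requested min.pts.bin.
#This assumes, as above, that ID and bad are the first two columns of binData.
sapply(binData[, -c(1, 2)], function(x) min(table(droplevels(factor(x)))))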
## ----WoePlot, fig.cap = "WOE transformed Months since recent inquiry for each bin", fig.height=3.5, fig.width=5, message=FALSE, echo=FALSE----
WOEdata = WOEProfet(data = binData, id = "ID", target = "bad")
data = WOEdata$vars$mths_since_recent_inq
data = data[, c(1, 2)]
ggplot2::ggplot(data = data,
                ggplot2::aes(x = mths_since_recent_inq_Bins,
                             y = mths_since_recent_inq_WOE)) +
  ggplot2::geom_bar(stat = "identity", fill = "#0066CC", width = 0.7) +
  ggplot2::geom_text(ggplot2::aes(label = round(mths_since_recent_inq_WOE, 3)),
                     vjust = 1.1, size = 3, colour = "black") +
  ggplot2::theme(legend.position = "none",
                 axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
  ggplot2::labs(x = "", y = "WOE")

## ----WOE-transformation, message=FALSE, eval=FALSE------------------------------
# WOEdata = WOEProfet(data = binData, id = "ID", target = "bad")

## ----eval=FALSE------------------------------------------------------------------
# head(WOEdata$IV[order(-WOEdata$IV$IV),],10)

## ----WOE-ex, echo=FALSE-----------------------------------------------------------
kableExtra::kbl(head(WOEdata$IV[order(-WOEdata$IV$IV), ], 10),
                caption = "Top 10 variables with the highest IV",
                booktabs = T, linesep = "") %>%
  kable_styling(latex_options = c("striped", "hold_position"))

## ----vars-in-WOEProfet, echo=FALSE--------------------------------------------------
kableExtra::kbl(WOEdata$vars$verification_status,
                caption = "vars output for Verification Status variable",
                booktabs = T, linesep = "") %>%
  kable_styling(latex_options = c("striped", "hold_position"))

## ----filter-WOE-object-by-IV, message=FALSE-------------------------------------------
subWOEdata = Var_select(WOEdata, "ID", "bad", IVfilter = 0.02)

## ----IVfilter-tb, echo=FALSE------------------------------------------------------------
kableExtra::kbl(subWOEdata$IV,
                caption = "Filtered variables with IV greater than 0.02",
                booktabs = T, linesep = "") %>%
  kable_styling(latex_options = c("striped", "hold_position"))

## ----IVfilter-tb2, echo=FALSE, eval=FALSE-------------------------------------------------
# # # table on the left
# # t1 = subWOEdata$IV[1:15,]
# # # table on the right
# # t2 = subWOEdata$IV[16:29,]
# #
# # kableExtra::kable(list(t1,t2),
# #                   caption = "Filtered variables with IV greater than 0.02",
# #                   booktabs = T,
# #                   linesep = "") %>%
# #   kable_styling(latex_options = c("striped","hold_position"))

## ----data-ex, echo=FALSE, fig.cap="WOE transformed dataset A transpose", out.width = '70%', fig.align='center'----
knitr::include_graphics("datawoe.jpg")

## ----K-means-clustering---------------------------------------------------------------
set.seed(4172018)
km_cluster = WOEclust_kmeans(subWOEdata, "ID", "bad", num_clusts = 10)

## ----Kmeans, echo=FALSE----------------------------------------------------------------
kableExtra::kbl(head(km_cluster, 10),
                caption = "Head of first 10 rows of clustered variables",
                booktabs = T, linesep = "") %>%
  kable_styling(latex_options = c("striped", "hold_position"))

## ----extract the top n variables from each cluster, message=FALSE-----------------------
top2_km <- km_cluster %>%
  group_by(Group) %>%
  top_n(n = 2)

## ----selectedVars, echo=FALSE-------------------------------------------------------------
kableExtra::kbl(top2_km, caption = "Variables selected by K-means clustering",
                booktabs = T, linesep = "") %>%
  kable_styling(latex_options = c("striped", "hold_position"))

## ----hierarchical clustering, message=FALSE-----------------------------------------------
h_cluster <- WOEclust_hclust(subWOEdata, 'ID', 'bad', num_clusts = 10)
top2_h <- h_cluster %>%
  group_by(Group) %>%
  top_n(n = 2)

## ----selectedVars2, echo=FALSE--------------------------------------------------------------
kableExtra::kbl(top2_h, caption = "Variables selected by hierarchical clustering",
                booktabs = T, linesep = "") %>%
  kable_styling(latex_options = c("striped", "hold_position"))

## ----WOEPlotter, fig.cap = "WOEplotter Output", fig.height=3.5, fig.width=6.5----
WOEplotter(binData, 'bad', 'acc_open_past_24mths_Bins')
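## ----woe-by-hand-sketch----------------------------------------------------------
#Illustrative sketch (not part of the original vignette): the WOE bars that
#WOEplotter() draws can be approximated by hand. One common convention defines a
#bin's WOE as log(share of goods / share of bads) and the variable's IV as
#sum((share of goods - share of bads) * WOE); the package's sign convention may
#differ, so treat this only as a cross-check of the magnitudes.
woe_by_hand <- binData %>%
  group_by(acc_open_past_24mths_Bins) %>%
  summarise(goods = sum(bad == 0), bads = sum(bad == 1)) %>%
  mutate(dist_good = goods / sum(goods),
         dist_bad  = bads / sum(bads),
         WOE = log(dist_good / dist_bad))
sum((woe_by_hand$dist_good - woe_by_hand$dist_bad) * woe_by_hand$WOE)  #IV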
"acc_open_past_24mths", "ID", "bad", breaks = c(0,2,3,4,5,6,7,9,Inf), plot = T) ## ----WOEPlotter2, fig.cap = "WOEplotter Output", fig.height=3.5, fig.width=6.5---- WOEplotter(binData, "bad", "home_ownership_Bins") ## ----custom-char, fig.cap = "Customized Binning for Factor Variable ", fig.height=3.5, fig.width=6.5---- WOE = WOE_customFac(train, "home_ownership", "ID", "bad", new_levels = c(1,2,2) ,plot = T) ## ----ex-fine-turning---------------------------------------------------------- #acc_open_past_24mths #WOEplotter(binData, "bad", 'acc_open_past_24mths_Bins') acc_open_past_24mths_new <- WOE_customNum(train, "acc_open_past_24mths", "ID", "bad", breaks = c(0,2,3,4,5,6,7,9,Inf) )$NewBin #dti #WOEplotter(binData, 'bad', "dti_Bins") dti_new <- WOE_customNum(train, "dti", "ID", 'bad', breaks = c(0,11,16,20,25,30,Inf))$NewBin #num_tl_op_past_12m #WOEplotter(binData, "bad", 'num_tl_op_past_12m_Bins') num_tl_op_past_12m_new <- WOE_customNum(train, "num_tl_op_past_12m", "ID", "bad", breaks = c(0,1,2,3,4,5,8,Inf), )$NewBin ## ----fine-tuning-the-selected-variables, echo=FALSE--------------------------- #using the variables selected from K-Means Clustering and fine-tuning if needed #selectedVars # first 3 vars are commented out since the chunk above already shows them as examples # #acc_open_past_24mths # #WOEplotter(binData, "bad", 'acc_open_past_24mths_Bins') # acc_open_past_24mths_new <- WOE_customNum(train, "acc_open_past_24mths", # "ID", "bad", # breaks = c(0,2,3,4,5,6,7,9,Inf))$NewBin # #dti # #WOEplotter(binData, 'bad', "dti_Bins") # dti_new <- WOE_customNum(train, "dti", "ID", 'bad', # breaks = c(0,11,16,20,25,30,Inf))$NewBin # #num_tl_op_past_12m # #WOEplotter_New(binData, "bad", 'num_tl_op_past_12m_Bins') # num_tl_op_past_12m_new <- WOE_customNum(train, "num_tl_op_past_12m", "ID", # "bad", # breaks = c(0,1,2,3,4,5,8,Inf),)$NewBin #inq_last_6mths #WOEplotter(binData, "bad", 'inq_last_6mths_Bins') inq_last_6mths_new <- WOE_customNum(train, "inq_last_6mths", "ID", "bad", breaks = c(0,1,2,3,Inf),)$NewBin #total_bc_limit #WOEplotter(binData, target = 'bad', var = "total_bc_limit_Bins") total_bc_limit_new <- WOE_customNum(train, 'total_bc_limit', "ID", 'bad', breaks = c(0,3600,7500,11500,16500, 23900,32400,49000,Inf))$NewBin #bc_open_to_buy #WOEplotter(binData, 'bad', 'bc_open_to_buy_Bins') bc_open_to_buy_new <- WOE_customNum(train, "bc_open_to_buy", "ID", 'bad', breaks = c(0,330,900,1750,2810,4250, 6600,9500,14500,25300,Inf))$NewBin #mths_since_recent_bc #WOEplotter(binData, target = 'bad', var = "mths_since_recent_bc_Bins") mths_since_recent_bc_new <- WOE_customNum(train, 'mths_since_recent_bc', "ID", 'bad', breaks = c(0,4,8,14,19,27,42,84, Inf))$NewBin #mths_since_recent_inq #WOEplotter(binData, "bad", 'mths_since_recent_inq_Bins') mths_since_recent_inq_new <- WOE_customNum(train, "mths_since_recent_inq", "ID", "bad", breaks = c(0,1,2,4,7,10,12,15, 22,Inf))$NewBin #mo_sin_rcnt_tl #WOEplotter(binData, target = 'bad', var = "mo_sin_rcnt_tl_Bins") mo_sin_rcnt_tl_new <- WOE_customNum(train, 'mo_sin_rcnt_tl', "ID", 'bad', breaks = c(0,1,2,3,5,8,10,14,24,Inf))$NewBin #tot_hi_cred_lim #WOEplotter(binData, 'bad', "tot_hi_cred_lim_Bins") tot_hi_cred_lim_new <- WOE_customNum(train, "tot_hi_cred_lim", "ID", 'bad', breaks = c(2500,28000,42000,61000, 93200,132000,220000, 330000,Inf))$NewBin #annual_inc #WOEplotter(binData, "bad", "annual_inc_Bins") annual_inc_new <- WOE_customNum(train, "annual_inc", "ID", 'bad', breaks = c(6000,32100,40000,48500,55000, 65100,75000,90000,125000,160000, Inf))$NewBin 
#verification_status
#WOEplotter(binData, "bad", "verification_status_Bins")
verification_status_new <- binData[, c("ID", "verification_status_Bins")]

#emp_length
#WOEplotter(binData, 'bad', "emp_length_Bins")
emp_length_new <- WOE_customFac(train, "emp_length", "ID", "bad",
                                new_levels = c(1,1,4,1,2,2,2,3,3,3,3,5))$NewBin

#bc_util
#WOEplotter(binData, 'bad', "bc_util_Bins")
bc_util_new <- WOE_customNum(train, "bc_util", "ID", 'bad',
                             breaks = c(0,17,34,55,65,74,89,96,Inf))$NewBin

#percent_bc_gt_75
#WOEplotter(binData, 'bad', "percent_bc_gt_75_Bins")
percent_bc_gt_75_new <- WOE_customNum(train, "percent_bc_gt_75", "ID", 'bad',
                                      breaks = c(0,2,21,37,55,69,81,Inf))$NewBin

#open_il_12m
#WOEplotter(binData, 'bad', "open_il_12m_Bins")
open_il_12m_new <- WOE_customNum(train, "open_il_12m", "ID", "bad",
                                 breaks = c(0,1,2,Inf))$NewBin

#mths_since_rcnt_il
#WOEplotter(binData, "bad", "mths_since_rcnt_il_Bins")
mths_since_rcnt_il_new <- WOE_customNum(train, "mths_since_rcnt_il", "ID", "bad",
                                        breaks = c(1,7,14,30,Inf))$NewBin
# mths_since_rcnt_il_new <- WOE_customNum(train, "mths_since_rcnt_il",
#                                         "ID", "bad",
#                                         breaks = c(1,Inf))$NewBin

## ----create-new-binned-data-with-fine-tuned-selected-variables----------------------
#get the id and target variables from the original binned data
binData_id_tar <- data.frame(binData[, c(1, 2)])

#extract the selected variable names
vars_name = top2_km$Variable

#rename the selected variable names (we recommend naming the variables in a
#consistent way, like what we did in the chunk above)
vars = gsub("_WOE", "_new", vars_name)

#write a helper that retrieves each fine-tuned data frame by its name
Fun2 <- function(vars) {
  get(vars)
}

#obtain a list of data frames, each containing the ID variable and one
#fine-tuned variable
var_list = lapply(vars, Fun2)

#left join each data frame from the list above on the ID variable
binData_new <- binData_id_tar %>%
  left_join(var_list %>% purrr::reduce(left_join, by = 'ID'), by = "ID")

## ----recalculate-the-WOE-after-fine-tuning, message=FALSE, results="hide"-----------
WOEdata_new = WOEProfet(binData_new, "ID", "bad")

## ----dti, fig.cap = "WOE and Target Rate for each attribute of dti", fig.height=3.1, fig.width=7, echo=FALSE----
data = WOEdata_new$vars$dti
a = ggplot2::ggplot(data = data, ggplot2::aes(x = dti_Bins, y = dti_WOE)) +
  ggplot2::geom_bar(stat = "identity", fill = "#0066CC", width = 0.7) +
  ggplot2::geom_text(ggplot2::aes(label = round(dti_WOE, 3)),
                     vjust = 1.1, size = 2.8, colour = "black") +
  ggplot2::theme(legend.position = "none",
                 axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
  ggplot2::labs(x = "", y = "WOE")
b = ggplot2::ggplot(data = data, ggplot2::aes(x = dti_Bins, y = TargetRate)) +
  ggplot2::geom_bar(stat = "identity", fill = "#0066CC", width = 0.7) +
  ggplot2::geom_text(ggplot2::aes(label = round(TargetRate, 3)),
                     vjust = 1.1, size = 2.8, colour = "black") +
  ggplot2::theme(legend.position = "none",
                 axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
  ggplot2::labs(x = "", y = "Target Rate")
c <- gridExtra::grid.arrange(a, b, ncol = 2, nrow = 1)
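## ----woe-vs-target-rate-check-sketch-------------------------------------------------
#Optional sketch (not part of the original vignette): within a variable, the
#bin-level WOE and target rate should be (near-)perfectly rank-correlated, with
#the sign depending on the WOE convention. A Spearman correlation close to +1 or
#-1 confirms the re-binned dti behaves as expected.
with(WOEdata_new$vars$dti, cor(dti_WOE, TargetRate, method = "spearman"))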
## ----bc-util, fig.cap = "WOE and Target Rate for each attribute of bc_util", fig.height=4, fig.width=9, echo=FALSE----
data = WOEdata_new$vars$bc_util
a = ggplot2::ggplot(data = data, ggplot2::aes(x = bc_util_Bins, y = bc_util_WOE)) +
  ggplot2::geom_bar(stat = "identity", fill = "#0066CC", width = 0.7) +
  ggplot2::geom_text(ggplot2::aes(label = round(bc_util_WOE, 3)),
                     vjust = 1.1, size = 2.8, colour = "black") +
  ggplot2::theme(legend.position = "none",
                 axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
  ggplot2::labs(x = "", y = "WOE")
b = ggplot2::ggplot(data = data, ggplot2::aes(x = bc_util_Bins, y = TargetRate)) +
  ggplot2::geom_bar(stat = "identity", fill = "#0066CC", width = 0.7) +
  ggplot2::geom_text(ggplot2::aes(label = round(TargetRate, 3)),
                     vjust = 1.1, size = 2.8, colour = "black") +
  ggplot2::theme(legend.position = "none",
                 axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
  ggplot2::labs(x = "", y = "Target Rate")
c <- gridExtra::grid.arrange(a, b, ncol = 2, nrow = 1)

## ----WOE-glm, fig.cap = "Generalized linear models with WOE variables", fig.height=5, fig.width=9, echo=FALSE----
par(mfrow = c(1, 2))
acc = setNames(aggregate(WOEdata_new$WOE$bad ~ WOEdata_new$WOE$dti_WOE, FUN = mean),
               c("dti_WOE", "bad"))
acc$logit_p = log(acc$bad/(1 - acc$bad))
plot(acc$dti_WOE, acc$logit_p, xlab = "WOE", ylab = "log-odds of Target Rate")
md_acc = glm(bad ~ dti_WOE, data = WOEdata_new$WOE, family = "binomial")
abline(md_acc$coefficients, col = "red", lwd = 1.5)
#abline(h=md_acc$coefficients[1])

msri = setNames(aggregate(WOEdata_new$WOE$bad ~ WOEdata_new$WOE$bc_util_WOE, FUN = mean),
                c("bc_util_WOE", "bad"))  #bad rate for each bin's WOE
msri$logit_p = log(msri$bad/(1 - msri$bad))  #logit(p)
#md_msri = glm(bad ~ bc_util_WOE, data = WOEdata_new$WOE, family = "binomial")
#plot(test$`WOEdata_new$WOE$bc_util_WOE`, test$logit_p)
points(msri$bc_util_WOE, msri$logit_p, pch = 2)
grid(nx = NULL, ny = NULL, lty = 2, col = "gray", lwd = 1)
legend("bottomright", legend = c("Model with dti_WOE", "Model with bc_util_WOE"),
       pch = c(1, 2), cex = 0.7)

md_2v = glm(bad ~ dti_WOE + bc_util_WOE, data = WOEdata_new$WOE, family = "binomial")
plot(acc$dti_WOE, acc$logit_p, xlab = "WOE", ylab = "log-odds of Target Rate")
points(msri$bc_util_WOE, msri$logit_p, pch = 2)
abline(md_acc$coefficients, col = "red", lwd = 1.5)
abline(md_2v$coefficients[1], md_2v$coefficients[2], col = "orange2", lwd = 1.5)
abline(md_2v$coefficients[1], md_2v$coefficients[3], col = "darkgreen", lwd = 1.5)
grid(nx = NULL, ny = NULL, lty = 2, col = "gray", lwd = 1)
legend("bottomright", title = "Model:",
       legend = c("with 1 WOE variable", "with 2 variables: dti_WOE",
                  "with 2 variables: bc_util_WOE"),
       col = c("red", "orange2", "darkgreen"), lty = 1, cex = 0.7)

## ----further-variable-selection-using-stepwise-algorithm----------------------------
fullModel = glm(bad ~ ., data = WOEdata_new$WOE[, -1], family = "binomial")
stepModel = step(fullModel, direction = "both", trace = F)
summary(stepModel)

## ----corrplot, echo=FALSE, fig.cap = "Correlation plot for highly correlated variables", fig.height=4, fig.width=5----
#keep the variables retained by stepwise selection (drop columns 5, 15, 16)
dat = Var_select(WOEdata_new, "ID", "bad", varcol = -c(5, 15, 16))
df_cor <- dat$WOE[, -c(1, 2)]            #remove id and target columns
corr <- cor(df_cor)
corr[lower.tri(corr, diag = TRUE)] <- NA #prepare to drop duplicates and correlations of 1
corr[corr == 1] <- NA                    #drop perfect correlations
corr <- as.data.frame(as.table(corr))    #turn into a 3-column table
corr <- na.omit(corr)                    #remove the NA values from above
corr <- subset(corr, abs(Freq) > 0.3)    #select significant values
corr <- corr[order(-abs(corr$Freq)), ]   #sort by highest correlation

#turn corr back into a matrix in order to plot with corrplot
mtx_corr <- reshape2::acast(corr, Var1 ~ Var2, value.var = "Freq")

#plot correlations visually
corrplot::corrplot(mtx_corr, is.corr = FALSE, tl.col = "black", na.label = " ",
                   tl.cex = 0.7, addCoef.col = 1, number.cex = 0.8, tl.srt = 45)
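## ----vif-check-sketch----------------------------------------------------------------
#Optional sketch (not part of the original vignette): variance inflation factors
#computed by hand, 1 / (1 - R^2) from regressing each WOE variable on the others,
#flag the same collinearity that the correlation plot shows. `dat` is the
#Var_select() output created in the chunk above.
X <- dat$WOE[, -c(1, 2)]  #drop ID and target columns
vifs <- sapply(names(X), function(v) {
  r2 <- summary(lm(reformulate(setdiff(names(X), v), response = v), data = X))$r.squared
  1 / (1 - r2)
})
round(sort(vifs, decreasing = TRUE), 2)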
"bad", varcol = c("acc_open_past_24mths_Bins","num_tl_op_past_12m_Bins", "mths_since_recent_inq_Bins","bc_open_to_buy_Bins", "mths_since_recent_bc_Bins","dti_Bins", "tot_hi_cred_lim_Bins", "annual_inc_Bins", "inq_last_6mths_Bins","verification_status_Bins", "bc_util_Bins","emp_length_Bins", "mths_since_rcnt_il_Bins")) new_Model = glm(bad~., data = selected_var$WOE[,-1], family = "binomial") summary(new_Model) ## ----create-scorecard, message=FALSE------------------------------------------ scorecard = ScorecardProfet(selected_var, id = 'ID', target = 'bad', GLModel = new_Model, PDO = 200, BaseOdds = 10, BasePts = 1000, reverse = FALSE) ## ----final-scorecard, echo=FALSE---------------------------------------------- kableExtra::kbl(scorecard, caption = "Final Scorecard", longtable = T, booktabs = T, linesep = "") %>% kable_styling(latex_options = c("striped","repeat_header")) ## ----valid-ex, fig.cap = "\\label{fig:valid_ex}Binning variable in the validation set", fig.height=3, fig.width=6.5---- (names <- unique(scorecard$Attribute)) valid <- validation %>% dplyr::select(ID, bad, all_of(names)) acc_open_past_24mths = WOE_customNum(valid, "acc_open_past_24mths", "ID", "bad", breaks = c(0,2,3,4,5,6,7,9,Inf), right_bracket = F, plot = T)$NewBin ## ----bin-the-validation-data-in-the-same-manner, include=FALSE---------------- #get the predictor names in the final scorecard (names <- unique(scorecard$Attribute)) #get a subset of validation set with variables on the scorecard valid <- validation %>% dplyr::select(ID, bad, all_of(names)) #bin each variable in the same way based on scorecard #simply copy the codes from the fune-tuning chunk and rplace train with valid #acc_open_past_24mths acc_open_past_24mths <- WOE_customNum(valid, "acc_open_past_24mths", "ID", "bad", breaks = c(0,2,3,4,5,6,7,9,Inf) )$NewBin #dti dti <- WOE_customNum(valid, "dti", "ID", 'bad', breaks = c(0,11,16,20,25,30,Inf))$NewBin #num_tl_op_past_12m num_tl_op_past_12m <- WOE_customNum(valid, "num_tl_op_past_12m", "ID", "bad", breaks = c(0,1,2,3,4,5,8,Inf) )$NewBin #inq_last_6mths inq_last_6mths <- WOE_customNum(valid, "inq_last_6mths", "ID", "bad", breaks = c(0,1,2,3,Inf))$NewBin #bc_open_to_buy bc_open_to_buy <- WOE_customNum(valid, "bc_open_to_buy", "ID", 'bad', breaks = c(0,330,900,1750,2810,4250, 6600,9500,14500,25300,Inf) )$NewBin #mths_since_recent_bc mths_since_recent_bc <- WOE_customNum(valid, 'mths_since_recent_bc', "ID", 'bad', breaks = c(0,4,8,14,19,27,42,84, Inf))$NewBin #mths_since_recent_inq mths_since_recent_inq <- WOE_customNum(valid, "mths_since_recent_inq", "ID", "bad", breaks = c(0,1,2,4,7,10,12,15, 22,Inf))$NewBin #tot_hi_cred_lim tot_hi_cred_lim <- WOE_customNum(valid, "tot_hi_cred_lim", "ID", 'bad', breaks = c(2500,28000,42000,61000, 93200,132000,220000, 330000,Inf))$NewBin #annual_inc annual_inc <- WOE_customNum(valid, "annual_inc", "ID", 'bad', breaks = c(6000,32100,40000,48500,55000, 65100,75000,90000,125000,160000, Inf))$NewBin #verification_status -- this variable stays the same verification_status <- valid[,c("ID", "verification_status")] #emp_length emp_length <- WOE_customFac(valid, "emp_length", "ID", "bad", new_levels = c(1,1,4,1,2,2,2,3,3,3,3,5) )$NewBin #bc_util bc_util <- WOE_customNum(valid, "bc_util", "ID", 'bad', breaks = c(0,17,34,55,65,74,89,96,Inf))$NewBin #mths_since_rcnt_il mths_since_rcnt_il <- WOE_customNum(valid, "mths_since_rcnt_il", "ID", "bad", breaks = c(1,7,14,30,Inf))$NewBin #obtain a list of lists, each sub-list contains a dataframe with fine-tuned #variables and ID variable 
#obtain a list of data frames, each containing the ID variable and one binned variable
test = lapply(names, Fun2)

#left join each data frame from the list above on the ID variable
binData_valid <- valid[, c("ID", "bad")] %>%
  left_join(test %>% purrr::reduce(left_join, by = 'ID'), by = "ID")

## ----binning-validation-ex----------------------------------------------------------
colnames(binData_valid) <- gsub("_Bins", "", colnames(binData_valid))
valid_score <- ScoreDataProfet(binData_valid, scorecard, "ID", "bad")

## ----scored-data, echo=FALSE----------------------------------------------------------
vs = head(valid_score, 4)
kableExtra::kbl(vs[, c(1:7)], caption = "Head of the scored validation data",
                booktabs = T, linesep = "") %>%
  kable_styling(latex_options = c("striped", "scale_down", "hold_position"))
kableExtra::kbl(vs[, c(8:15)],
                #caption = "Head of the scored validation data",
                booktabs = T, linesep = "") %>%
  kable_styling(latex_options = c("striped", "scale_down", "hold_position"))
kableExtra::kbl(vs[, c(16:22)],
                #caption = "Head of the scored validation data",
                booktabs = T, linesep = "") %>%
  kable_styling(latex_options = c("striped", "scale_down", "hold_position"))
kableExtra::kbl(vs[, c(23:29)],
                #caption = "Head of the scored validation data",
                booktabs = T, linesep = "") %>%
  kable_styling(latex_options = c("striped", "scale_down", "hold_position"))

## ----print-wide-table, results="asis", echo=FALSE, eval=FALSE------------------------
# #this doesn't look better than simply using head()
# test = head(valid_score,5)
# m <- matrix(1:ncol(test), 5)
#
# for (i in 1:ncol(m)) {
#   cat(kable(test[, m[, i]], 'latex', booktabs=TRUE), "\\newline")
# }

## ----gains-tb-train, echo=FALSE--------------------------------------------------------
#ROCit
#training set gains table
colnames(selected_var$Bin) <- gsub("_Bins", "", colnames(selected_var$Bin))
train_score = ScoreDataProfet(selected_var$Bin, scorecard, "ID", "bad")
gain_train <- gainstable(score = -train_score$Score, class = train_score$bad,
                         negref = 0, ngroup = 10)

#validation set gains table
gain_valid <- gainstable(score = -valid_score$Score, class = valid_score$bad,
                         negref = 0, ngroup = 10)
# gain_train
# gain_valid

## 1. Calculate the average points
##-----------------------------##-----------------------------##
## Training set
os = sort(train_score$Score, decreasing = T)
obs = length(os)/10
b = seq(obs, length(os), obs)
n = ntile(desc(os), 10)
#table(n)
tb = data.frame(os, n)
avg_pt_t = tb %>%
  group_by(n) %>%
  summarise(Avg_point = mean(os))
#avg_pt_t
#-------------------------------------------------------------------#
## Validation set
os = sort(valid_score$Score, decreasing = T)
obs = length(os)/10
b = seq(obs, length(os), obs)
n = ntile(desc(os), 10)
#table(n)
tb = data.frame(os, n)
avg_pt_v = tb %>%
  group_by(n) %>%
  summarise(Avg_point = mean(os))
#avg_pt_v
##-----------------------------##-----------------------------##
## 2. Add the average points column to the gains tables
#gain_train
train_gtb = as.data.frame(matrix(unlist(gain_train), nrow = length(unlist(gain_train[1]))))
nm = names(gain_train)
names(train_gtb) = nm
train_gtb$`Avg Score` = round(avg_pt_t$Avg_point, 0)
kableExtra::kbl(train_gtb, caption = "Gains table for the training data",
                booktabs = T, linesep = "") %>%
  kable_styling(latex_options = c("striped", "scale_down", "hold_position"))
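## ----gains-cross-check-sketch----------------------------------------------------------
#Optional sketch (not part of the original vignette): the top bucket's cumulative
#capture rate in the training gains table should roughly equal the share of all
#bads that fall in the riskiest (lowest-scoring) 10% of the training set; small
#differences can arise from ties at the decile boundary.
cut10 <- quantile(train_score$Score, 0.10)
sum(train_score$bad[train_score$Score <= cut10]) / sum(train_score$bad)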
## ----gains-tb-test, echo=FALSE----------------------------------------------------------
#gain_valid
valid_gtb = as.data.frame(matrix(unlist(gain_valid), nrow = length(unlist(gain_valid[1]))))
nm = names(gain_valid)
names(valid_gtb) = nm
valid_gtb$`Avg Score` = round(avg_pt_v$Avg_point, 0)
kableExtra::kbl(valid_gtb, caption = "Gains table for the validation data",
                booktabs = T, linesep = "") %>%
  kable_styling(latex_options = c("striped", "scale_down", "hold_position"))

## ----gains, fig.cap = "Gains Charts", echo=FALSE, fig.height=4, fig.width=9----
par(mfrow = c(1, 2))
train.depth = c(0, (gain_train$Bucket)*10)
train.gaincurve = c(0, (gain_train$CCapRate)*100)
plot(train.depth, train.gaincurve, col = 'cyan3', pch = 16,
     xlim = c(0, 100), ylim = c(0, 100), cex.main = 0.8,
     main = 'Gains Chart for Training Set',
     xlab = 'Percentile', ylab = '% of Total Response')
grid(nx = NULL, ny = NULL, lty = 2, col = "gray", lwd = 1)
lines(train.depth, train.gaincurve, col = 'cyan3', lty = 1)
axis(1, at = c(10, 30, 50, 70, 90))
points(train.depth, train.depth, col = 'red', pch = 16)
lines(train.depth, train.depth, col = 'red', lty = 1)
legend('topleft', legend = c('Logistic Model', 'Baseline'), col = c('cyan3', 'red'),
       pch = 16, lty = 1, cex = 0.7)

valid.depth = c(0, (gain_valid$Bucket)*10)
valid.gaincurve = c(0, (gain_valid$CCapRate)*100)
plot(valid.depth, valid.gaincurve, col = 'cyan3', pch = 16,
     xlim = c(0, 100), ylim = c(0, 100), cex.main = 0.8,
     main = 'Gains Chart for Validation Set',
     xlab = 'Percentile', ylab = '% of Total Response')
grid(nx = NULL, ny = NULL, lty = 2, col = "gray", lwd = 1)
lines(valid.depth, valid.gaincurve, col = 'cyan3', lty = 1)
axis(1, at = c(10, 30, 50, 70, 90))
points(valid.depth, valid.depth, col = 'red', pch = 16)
lines(valid.depth, valid.depth, col = 'red', lty = 1)
legend('topleft', legend = c('Logistic Model', 'Baseline'), col = c('cyan3', 'red'),
       pch = 16, lty = 1, cex = 0.7)

## ----confusion-m, echo=FALSE, fig.cap="Confusion Matrix", out.width = '40%', fig.align='center'----
knitr::include_graphics("confusion_m.jpg")

## ----model-roc, fig.cap = "ROC Curves for Training and Validation Sets", message=FALSE, fig.height=4, fig.width=9, echo=FALSE----
par(mfrow = c(1, 2))
#-----------------ROCit-------------------#
# roc_train <- rocit(score = step_result$step_pred, class = step_result$bad,
#                    negref = 0)
# #plot(roc_train$FPR, roc_train$TPR, type='l')
# plot(roc_train, YIndex = F, values = F, col = c(2,4))
# roc_train$AUC
# gainstable(roc_train, breaks = seq(10,100,10))

#Training set
roc_train <- rocit(score = -train_score$Score, class = train_score$bad, negref = 0)
#plot(roc_train, YIndex = F, values = F, col = c(2,4))
#roc_train$AUC

#Validation set
roc_valid <- rocit(score = -valid_score$Score, class = valid_score$bad, negref = 0)
#plot(roc_valid, YIndex = F, values = F, col = c(2,4))
#roc_valid$AUC

#create my own ROC plots
plot(roc_train$FPR, roc_train$TPR, type = 'l', col = 'red', lwd = 2,
     xlab = '1-Specificity (FPR)', ylab = 'Sensitivity (TPR)',
     main = 'ROC Curve for the Training Set', cex.main = 0.8)
grid(nx = NULL, ny = NULL, lty = 2, col = "gray", lwd = 1)
abline(0, 1, lty = 2, col = 'deepskyblue')
legend('bottomright', legend = c('Empirical ROC curve', 'Baseline'),
       col = c('red', 'deepskyblue'), pch = 16, lty = 1, cex = 0.7)

plot(roc_valid$FPR, roc_valid$TPR, type = 'l', col = 'red', lwd = 2,
     xlab = '1-Specificity (FPR)', ylab = 'Sensitivity (TPR)',
     main = 'ROC Curve for the Validation Set', cex.main = 0.8)
grid(nx = NULL, ny = NULL, lty = 2, col = "gray", lwd = 1)
abline(0, 1, lty = 2, col = 'deepskyblue')
legend('bottomright', legend = c('Empirical ROC curve', 'Baseline'),
       col = c('red', 'deepskyblue'), pch = 16, lty = 1, cex = 0.7)
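## ----auc-by-hand-sketch------------------------------------------------------------------
#Optional cross-check (not part of the original vignette): AUC equals the
#probability that a randomly chosen bad receives a higher risk score (-Score)
#than a randomly chosen good, which the rank-sum (Mann-Whitney) formula computes
#directly. It should match roc_train$AUC reported in the fit statistics below.
r  <- rank(-train_score$Score)
n1 <- sum(train_score$bad == 1)
n0 <- sum(train_score$bad == 0)
(sum(r[train_score$bad == 1]) - n1 * (n1 + 1) / 2) / (n1 * n0)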
## ----roc, fig.cap = "ROC Curves", fig.height=4, fig.width=5, echo=FALSE-------
plot(roc_train$FPR, roc_train$TPR, type = 'l', main = "ROC Curves Comparison",
     xlab = "FPR", ylab = "TPR", lwd = 2, cex.main = 0.8)
grid(nx = NULL, ny = NULL, lty = 2, col = "gray", lwd = 1)
segments(x0 = 0, y0 = 0, x1 = 0, y1 = 1, col = "darkgreen", lwd = 2)
segments(x0 = 0, y0 = 1, x1 = 1, y1 = 1, col = "darkgreen", lwd = 2)
segments(x0 = 0, y0 = 0, x1 = 1, y1 = 1, col = "red", lwd = 2)
#plot(roc_valid$FPR, roc_valid$TPR, type='l', add=T)
lines(roc_valid$FPR, roc_valid$TPR, col = 'orange', lwd = 2)
legend("bottomright",
       legend = c("perfect", "random guess", "training model", "validation model"),
       col = c("darkgreen", "red", "black", "orange"), lty = 1, cex = 0.7, lwd = 2)

## ----KS, fig.cap = "KS Plots", echo=FALSE, fig.height=4, fig.width=9----------
#ROCit---KS plots and KS statistics
#ks_train = ksplot(roc_train)
#ks_train$`KS stat`
#ks_valid = ksplot(roc_valid)
#ks_valid$`KS stat`

#make my own KS plots
par(mfrow = c(1, 2))
measure.t <- measureit(score = -train_score$Score, class = train_score$bad,
                       negref = 0, measure = c("ACC", "SENS", "TPR", "FPR"))
depth <- measure.t$Depth
plot(depth, measure.t$TPR, type = 'l', xlab = 'Percentile', ylab = '',
     main = 'Training set K-S plot', col = 'darkorange2', lwd = 2, cex.main = 0.8)
lines(depth, measure.t$FPR, col = 'navy', lwd = 2)
#which.max(abs(measure.t$TPR-measure.t$FPR))  #486
segments(x0 = depth[486], y0 = measure.t$FPR[486],
         x1 = depth[486], y1 = measure.t$TPR[486], col = "deeppink", lwd = 2)
grid(nx = NULL, ny = NULL, lty = 2, col = "gray", lwd = 1)
legend('bottomright',
       legend = c('True Positive Rate', 'False Positive Rate', 'KS Stat'),
       col = c('darkorange2', 'navy', 'deeppink'), pch = 16, lty = 1, cex = 0.7, lwd = 2)
ks_train = max(abs(measure.t$TPR - measure.t$FPR))  #KS statistic

measure.v <- measureit(score = -valid_score$Score, class = valid_score$bad,
                       negref = 0, measure = c("ACC", "SENS", "TPR", "FPR"))
depth <- measure.v$Depth
plot(depth, measure.v$TPR, type = 'l', xlab = 'Percentile', ylab = '',
     main = 'Validation set K-S plot', col = 'darkorange2', lwd = 2, cex.main = 0.8)
lines(depth, measure.v$FPR, col = 'navy', lwd = 2)
#which.max(abs(measure.v$TPR-measure.v$FPR))  #452
segments(x0 = depth[452], y0 = measure.v$FPR[452],
         x1 = depth[452], y1 = measure.v$TPR[452], col = "deeppink", lwd = 2)
grid(nx = NULL, ny = NULL, lty = 2, col = "gray", lwd = 1)
legend('bottomright',
       legend = c('True Positive Rate', 'False Positive Rate', 'KS Stat'),
       col = c('darkorange2', 'navy', 'deeppink'), pch = 16, lty = 1, cex = 0.7, lwd = 2)
ks_valid = max(abs(measure.v$TPR - measure.v$FPR))  #KS statistic

## ----AUC-KS-tb, echo=FALSE------------------------------------------------------
tb = data.frame(Data = c('Training Data', 'Validation Data'), AUC = 0, KS = 0)
tb[1, 2] = round(roc_train$AUC, 3)
tb[2, 2] = round(roc_valid$AUC, 3)
tb[1, 3] = round(ks_train, 3)
tb[2, 3] = round(ks_valid, 3)
kableExtra::kbl(tb, caption = "Fit statistics table", booktabs = T, linesep = "")
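## ----ks-by-hand-sketch-----------------------------------------------------------
#Optional cross-check (not part of the original vignette): the KS statistic is
#the largest vertical gap between the empirical score distributions of the bads
#and the goods; computed this way on the validation scores it should agree with
#the KS value in the table above.
sc <- valid_score$Score
max(abs(ecdf(sc[valid_score$bad == 1])(sort(unique(sc))) -
        ecdf(sc[valid_score$bad == 0])(sort(unique(sc)))))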