Machine learning: Boosting

Gradient Boosting in caret
load data
Stochastic Gradient Boosting
基于xgboost包的普通构建模型
基于xgboost包的调参建模 (一)
基于xgboost包的调参建模 (二)
基于caret包的建模
Summary ROC
Information
Reference

更多知识分享请到 https://zouhua.top/。

knitr::opts_chunk$set(warning = F, message = F)
library(dplyr)
library(tibble)
library(tidyverse)
library(ISLR)
library(caret)
library(pROC)
library(xgboost)

Gradient Boosting in caret

Model	method Value	Type	Libraries	Tuning Parameters
eXtreme Gradient Boosting	xgbDART	Classification, Regression	xgboost, plyr	nrounds, max_depth, eta, gamma, subsample, colsample_bytree, rate_drop, skip_drop, min_child_weight
eXtreme Gradient Boosting	xgbLinear	Classification, Regression	xgboost	nrounds, lambda, alpha, eta
eXtreme Gradient Boosting	xgbTree	Classification, Regression	xgboost, plyr	nrounds, max_depth, eta, gamma, colsample_bytree, min_child_weight, subsample
Gradient Boosting Machines	gbm_h2o	Classification, Regression	h2o	ntrees, max_depth, min_rows, learn_rate, col_sample_rate
Stochastic Gradient Boosting	gbm	Classification, Regression	gbm, plyr	n.trees, interaction.depth, shrinkage, n.minobsinnode

load data

ml_data <- College
ml_data %>%
  glimpse()
set.seed(123)
index <- createDataPartition(ml_data$Private, p = 0.7, list = FALSE)
train_data <- ml_data[index, ]
test_data  <- ml_data[-index, ]

Stochastic Gradient Boosting

# Train model with preprocessing & repeated cv
model_gbm <- caret::train(Private ~ .,
                          data = train_data,
                          method = "gbm",
                          preProcess = c("scale", "center"),
                          trControl = trainControl(method = "repeatedcv", 
                                                  number = 5, 
                                                  repeats = 3, 
                                                  verboseIter = FALSE),
                          verbose = 0)
model_gbm

caret::confusionMatrix(
  data = predict(model_gbm, test_data),
  reference = test_data$Private
  )
predict(model_gbm, test_data, type = "raw")   
predict(model_gbm, test_data, type = "prob")

rocobj0 <- roc(test_data$Private, predict(model_gbm, newdata = test_data, type = "prob")[, "No"])
auc0 <- round(auc(test_data$Private, predict(model_gbm, newdata = test_data, type = "prob")[, "Yes"]),4)
ggroc(rocobj0, color = "red", linetype = 1, size = 1, alpha = 1, legacy.axes = T)+
                geom_abline(intercept = 0, slope = 1, color="grey", size = 1, linetype=1)+
              labs(x = "False Positive Rate (1 - Specificity)",
                   y = "True Positive Rate (Sensivity or Recall)")+
              annotate("text",x = .75, y = .25,label=paste("AUC =", auc0),
                       size = 5, family="serif")+
              coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))+
              theme_bw()+
              theme(panel.background = element_rect(fill = 'transparent'),
                    axis.ticks.length = unit(0.4, "lines"), 
                    axis.ticks = element_line(color='black'),
                    axis.line = element_line(size=.5, colour = "black"),
                    axis.title = element_text(colour='black', size=12,face = "bold"),
                    axis.text = element_text(colour='black',size=10,face = "bold"),
                    text = element_text(size=8, color="black", family="serif"))

基于xgboost包的普通构建模型

xgboost_model <- xgboost(data = as.matrix(train_data[, -1]), 
                         label = as.numeric(train_data$Private)-1,
                         max_depth = 3, 
                         objective = "binary:logistic", 
                         nrounds = 10, 
                         verbose = FALSE,
                         prediction = TRUE,
                         eval_metric = "logloss")
xgboost_model

predict(xgboost_model, 
        as.matrix(test_data[, -1])) %>%
  as.tibble() %>%
  mutate(prediction = round(value),
         label = as.numeric(test_data$Private)-1) %>%
  count(prediction, label)

rocobj1 <- roc((as.numeric(test_data$Private) - 1), predict(xgboost_model, as.matrix(test_data[, -1]), type="prob"))
auc1 <- round(auc((as.numeric(test_data$Private) - 1), predict(xgboost_model, as.matrix(test_data[, -1]), type="prob")), 4)
ggroc(rocobj1, color = "red", linetype = 1, size = 1, alpha = 1, legacy.axes = T)+
                geom_abline(intercept = 0, slope = 1, color="grey", size = 1, linetype=1)+
              labs(x = "False Positive Rate (1 - Specificity)",
                   y = "True Positive Rate (Sensivity or Recall)")+
              annotate("text",x = .75, y = .25,label=paste("AUC =", auc1),
                       size = 5, family="serif")+
              coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))+
              theme_bw()+
              theme(panel.background = element_rect(fill = 'transparent'),
                    axis.ticks.length = unit(0.4, "lines"), 
                    axis.ticks = element_line(color='black'),
                    axis.line = element_line(size=.5, colour = "black"),
                    axis.title = element_text(colour='black', size=12,face = "bold"),
                    axis.text = element_text(colour='black',size=10,face = "bold"),
                    text = element_text(size=8, color="black", family="serif"))

Machine learning: Boosting - 图1

基于xgboost包的调参建模 (一)

dtrain <- xgb.DMatrix(as.matrix(train_data[, -1]), 
                      label = as.numeric(train_data$Private)-1)
dtest <- xgb.DMatrix(as.matrix(test_data[, -1]), 
                      label = as.numeric(test_data$Private)-1)
params <- list(max_depth = 3, 
               objective = "binary:logistic",
               silent = 0)
watchlist <- list(train = dtrain, eval = dtest)
bst_model <- xgb.train(params = params, 
                       data = dtrain, 
                       nrounds = 10, 
                       watchlist = watchlist,
                       verbose = FALSE,
                       prediction = TRUE,
                       eval_metric = "logloss")
bst_model

predict(bst_model, 
        as.matrix(test_data[, -1])) %>%
  as_tibble() %>%
  mutate(prediction = round(value),
         label = as.numeric(test_data$Private)-1) %>%
  count(prediction, label)

rocobj2 <- roc((as.numeric(test_data$Private) - 1), predict(bst_model, as.matrix(test_data[, -1]), type="prob"))
auc2 <- round(auc((as.numeric(test_data$Private) - 1), predict(bst_model, as.matrix(test_data[, -1]), type="prob")), 4)
ggroc(rocobj2, color = "red", linetype = 1, size = 1, alpha = 1, legacy.axes = T)+
                geom_abline(intercept = 0, slope = 1, color="grey", size = 1, linetype=1)+
              labs(x = "False Positive Rate (1 - Specificity)",
                   y = "True Positive Rate (Sensivity or Recall)")+
              annotate("text",x = .75, y = .25,label=paste("AUC =", auc2),
                       size = 5, family="serif")+
              coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))+
              theme_bw()+
              theme(panel.background = element_rect(fill = 'transparent'),
                    axis.ticks.length = unit(0.4, "lines"), 
                    axis.ticks = element_line(color='black'),
                    axis.line = element_line(size=.5, colour = "black"),
                    axis.title = element_text(colour='black', size=12,face = "bold"),
                    axis.text = element_text(colour='black',size=10,face = "bold"),
                    text = element_text(size=8, color="black", family="serif"))

Machine learning: Boosting - 图2

基于xgboost包的调参建模 (二)

cv_model <- xgb.cv(params = params,
                   data = dtrain, 
                   nrounds = 100, 
                   watchlist = watchlist,
                   nfold = 5,
                   verbose = FALSE,
                   prediction = TRUE,
                   eval_metric = "logloss") # prediction of cv folds
cv_model$evaluation_log %>%
  filter(test_logloss_mean == min(test_logloss_mean))
min_logloss <- min(cv_model$evaluation_log[, test_logloss_mean])
min_logloss_index <- which.min(cv_model$evaluation_log[, test_logloss_mean])
nround <- min_logloss_index
best_param <- params
bst_model_cv <- xgb.train(data=dtrain, params=best_param, nrounds=nround, nthread=6)
bst_model_cv

rocobj3 <- roc((as.numeric(test_data$Private) - 1), predict(bst_model_cv, as.matrix(test_data[, -1]), type="prob"))
auc3 <- round(auc((as.numeric(test_data$Private) - 1), predict(bst_model_cv, as.matrix(test_data[, -1]), type="prob")), 4)
ggroc(rocobj3, color = "red", linetype = 1, size = 1, alpha = 1, legacy.axes = T)+
                geom_abline(intercept = 0, slope = 1, color="grey", size = 1, linetype=1)+
              labs(x = "False Positive Rate (1 - Specificity)",
                   y = "True Positive Rate (Sensivity or Recall)")+
              annotate("text",x = .75, y = .25,label=paste("AUC =", auc3),
                       size = 5, family="serif")+
              coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))+
              theme_bw()+
              theme(panel.background = element_rect(fill = 'transparent'),
                    axis.ticks.length = unit(0.4, "lines"), 
                    axis.ticks = element_line(color='black'),
                    axis.line = element_line(size=.5, colour = "black"),
                    axis.title = element_text(colour='black', size=12,face = "bold"),
                    axis.text = element_text(colour='black',size=10,face = "bold"),
                    text = element_text(size=8, color="black", family="serif"))

Machine learning: Boosting - 图3

基于caret包的建模

X_train <- xgb.DMatrix(as.matrix(train_data %>% select(-Private)))
y_train <- train_data$Private
X_test <- xgb.DMatrix(as.matrix(test_data %>% select(-Private)))
y_test <- test_data$Private
xgb_trcontrol <- trainControl(method = "cv",
                              number = 5,  
                              allowParallel = TRUE,
                              verboseIter = FALSE,
                              returnData = FALSE)
xgbGrid <- expand.grid(nrounds = c(100, 200),
                       max_depth = c(10, 15, 20, 25),
                       colsample_bytree = seq(0.5, 0.9, length.out = 5),
                       eta = 0.1,
                       gamma = 0,
                       min_child_weight = 1,
                       subsample = 1)
set.seed(123) 
xgb_model_caret <- train(X_train, y_train,  
                   trControl = xgb_trcontrol,
                   tuneGrid = xgbGrid,
                   method = "xgbTree")
xgb_model_caret$bestTune

pred <- predict(xgb_model_caret, newdata=X_test)
print(confusionMatrix(pred, y_test))

rocobj4 <- roc(y_test, predict(xgb_model_caret, newdata = X_test, type = "prob")[, "Yes"])
auc4 <- round(auc(y_test, predict(xgb_model_caret, newdata = X_test, type = "prob")[, "Yes"]), 4)
ggroc(rocobj4, color = "red", linetype = 1, size = 1, alpha = 1, legacy.axes = T)+
                geom_abline(intercept = 0, slope = 1, color="grey", size = 1, linetype=1)+
              labs(x = "False Positive Rate (1 - Specificity)",
                   y = "True Positive Rate (Sensivity or Recall)")+
              annotate("text",x = .75, y = .25,label=paste("AUC =", auc4),
                       size = 5, family="serif")+
              coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))+
              theme_bw()+
              theme(panel.background = element_rect(fill = 'transparent'),
                    axis.ticks.length = unit(0.4, "lines"), 
                    axis.ticks = element_line(color='black'),
                    axis.line = element_line(size=.5, colour = "black"),
                    axis.title = element_text(colour='black', size=12,face = "bold"),
                    axis.text = element_text(colour='black',size=10,face = "bold"),
                    text = element_text(size=8, color="black", family="serif"))

Machine learning: Boosting - 图4

Summary ROC

rocobj1 <- roc((as.numeric(test_data$Private) - 1), predict(xgboost_model, as.matrix(test_data[, -1]), type="prob"))
auc1 <- round(auc((as.numeric(test_data$Private) - 1), predict(xgboost_model, as.matrix(test_data[, -1]), type="prob")), 4)
rocobj2 <- roc((as.numeric(test_data$Private) - 1), predict(bst_model, as.matrix(test_data[, -1]), type="prob"))
auc2 <- round(auc((as.numeric(test_data$Private) - 1), predict(bst_model, as.matrix(test_data[, -1]), type="prob")), 4)
rocobj3 <- roc((as.numeric(test_data$Private) - 1), predict(bst_model_cv, as.matrix(test_data[, -1]), type="prob"))
auc3 <- round(auc((as.numeric(test_data$Private) - 1), predict(bst_model_cv, as.matrix(test_data[, -1]), type="prob")), 4)
rocobj4 <- roc(y_test, predict(xgb_model_caret, newdata = X_test, type = "prob")[, "Yes"])
auc4 <- round(auc(y_test, predict(xgb_model_caret, newdata = X_test, type = "prob")[, "Yes"]), 4)
rocboj_list <- list(xgboost=rocobj1, 
                    best=rocobj2, 
                    best_cv=rocobj3, 
                    xgb_caret=rocobj4)
ggroc(rocboj_list, linetype = 1, size = 1, alpha = 1, legacy.axes = T)+
                geom_abline(intercept = 0, slope = 1, color="grey", size = 1, linetype=1)+
              labs(x = "False Positive Rate (1 - Specificity)",
                   y = "True Positive Rate (Sensivity or Recall)")+
              annotate("text",x = .75, y = .25,label=paste("AUC =", auc1, "(xgboost_model)"),
                       size = 5, family="serif")+
              annotate("text",x = .75, y = .20,label=paste("AUC =", auc2, "(bst_model)"),
                       size = 5, family="serif")+
              annotate("text",x = .75, y = .15,label=paste("AUC =", auc3, "(bst_model_cv)"),
                       size = 5, family="serif")+
              annotate("text",x = .75, y = .10,label=paste("AUC =", auc4, "(xgb_model_caret)"),
                       size = 5, family="serif")+   
              coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))+
              scale_colour_manual(values = c("red", "blue", "black", "green"))+
              theme_bw()+
              theme(panel.background = element_rect(fill = 'transparent'),
                    axis.ticks.length = unit(0.4, "lines"), 
                    axis.ticks = element_line(color='black'),
                    axis.line = element_line(size=.5, colour = "black"),
                    axis.title = element_text(colour='black', size=12,face = "bold"),
                    axis.text = element_text(colour='black',size=10,face = "bold"),
                    text = element_text(size=8, color="black", family="serif"))

Machine learning: Boosting - 图5

Information

sessionInfo()

R version 4.0.2 (2020-06-22)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19042)
Matrix products: default
locale:
[1] LC_COLLATE=English_United States.1252  LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    
system code page: 936
attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     
other attached packages:
 [1] xgboost_1.3.1.1 pROC_1.16.2     caret_6.0-86    lattice_0.20-41 ISLR_1.2        forcats_0.5.0  
 [7] stringr_1.4.0   purrr_0.3.4     readr_1.4.0     tidyr_1.1.2     ggplot2_3.3.3   tidyverse_1.3.0
[13] tibble_3.0.3    dplyr_1.0.2    
loaded via a namespace (and not attached):
 [1] httr_1.4.2           jsonlite_1.7.1       splines_4.0.2        foreach_1.5.1        prodlim_2019.11.13  
 [6] modelr_0.1.8         assertthat_0.2.1     stats4_4.0.2         cellranger_1.1.0     yaml_2.2.1          
[11] ipred_0.9-9          pillar_1.4.6         backports_1.1.10     glue_1.4.2           digest_0.6.25       
[16] rvest_0.3.6          colorspace_1.4-1     recipes_0.1.15       gbm_2.1.8            htmltools_0.5.0     
[21] Matrix_1.3-2         plyr_1.8.6           timeDate_3043.102    pkgconfig_2.0.3      broom_0.7.3         
[26] haven_2.3.1          scales_1.1.1         gower_0.2.2          lava_1.6.8           farver_2.0.3        
[31] generics_0.1.0       ellipsis_0.3.1       withr_2.3.0          nnet_7.3-14          cli_2.1.0           
[36] survival_3.2-7       magrittr_1.5         crayon_1.3.4         readxl_1.3.1         evaluate_0.14       
[41] fs_1.5.0             fansi_0.4.1          nlme_3.1-150         MASS_7.3-53          xml2_1.3.2          
[46] class_7.3-17         rsconnect_0.8.16     tools_4.0.2          data.table_1.13.6    hms_0.5.3           
[51] lifecycle_0.2.0      munsell_0.5.0        reprex_0.3.0         compiler_4.0.2       e1071_1.7-4         
[56] rlang_0.4.7          grid_4.0.2           iterators_1.0.13     rstudioapi_0.11      labeling_0.4.2      
[61] rmarkdown_2.5        gtable_0.3.0         ModelMetrics_1.2.2.2 codetools_0.2-18     DBI_1.1.0           
[66] reshape2_1.4.4       R6_2.5.0             lubridate_1.7.9      knitr_1.30           utf8_1.1.4          
[71] stringi_1.5.3        Rcpp_1.0.5           vctrs_0.3.4          rpart_4.1-15         dbplyr_2.0.0        
[76] tidyselect_1.1.0     xfun_0.19

Gradient Boosting in caret

load data

Stochastic Gradient Boosting

基于xgboost包的普通构建模型

基于xgboost包的调参建模 (一)

基于xgboost包的调参建模 (二)

基于caret包的建模

Summary ROC

Information

Reference