更多知识分享请到 https://zouhua.top/

  1. knitr::opts_chunk$set(warning = F, message = F)
  2. library(dplyr)
  3. library(tibble)
  4. library(tidyverse)
  5. library(ISLR)
  6. library(caret)
  7. library(pROC)
  8. library(xgboost)

Gradient Boosting in caret

Model method Value Type Libraries Tuning Parameters
eXtreme Gradient Boosting xgbDART Classification, Regression xgboost, plyr nrounds, max_depth, eta, gamma, subsample, colsample_bytree, rate_drop, skip_drop, min_child_weight
eXtreme Gradient Boosting xgbLinear Classification, Regression xgboost nrounds, lambda, alpha, eta
eXtreme Gradient Boosting xgbTree Classification, Regression xgboost, plyr nrounds, max_depth, eta, gamma, colsample_bytree, min_child_weight, subsample
Gradient Boosting Machines gbm_h2o Classification, Regression h2o ntrees, max_depth, min_rows, learn_rate, col_sample_rate
Stochastic Gradient Boosting gbm Classification, Regression gbm, plyr n.trees, interaction.depth, shrinkage, n.minobsinnode

load data

  1. ml_data <- College
  2. ml_data %>%
  3. glimpse()
  4. set.seed(123)
  5. index <- createDataPartition(ml_data$Private, p = 0.7, list = FALSE)
  6. train_data <- ml_data[index, ]
  7. test_data <- ml_data[-index, ]

Stochastic Gradient Boosting

  1. # Train model with preprocessing & repeated cv
  2. model_gbm <- caret::train(Private ~ .,
  3. data = train_data,
  4. method = "gbm",
  5. preProcess = c("scale", "center"),
  6. trControl = trainControl(method = "repeatedcv",
  7. number = 5,
  8. repeats = 3,
  9. verboseIter = FALSE),
  10. verbose = 0)
  11. model_gbm
  1. caret::confusionMatrix(
  2. data = predict(model_gbm, test_data),
  3. reference = test_data$Private
  4. )
  5. predict(model_gbm, test_data, type = "raw")
  6. predict(model_gbm, test_data, type = "prob")
  • ROC
  1. rocobj0 <- roc(test_data$Private, predict(model_gbm, newdata = test_data, type = "prob")[, "No"])
  2. auc0 <- round(auc(test_data$Private, predict(model_gbm, newdata = test_data, type = "prob")[, "Yes"]),4)
  3. ggroc(rocobj0, color = "red", linetype = 1, size = 1, alpha = 1, legacy.axes = T)+
  4. geom_abline(intercept = 0, slope = 1, color="grey", size = 1, linetype=1)+
  5. labs(x = "False Positive Rate (1 - Specificity)",
  6. y = "True Positive Rate (Sensivity or Recall)")+
  7. annotate("text",x = .75, y = .25,label=paste("AUC =", auc0),
  8. size = 5, family="serif")+
  9. coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))+
  10. theme_bw()+
  11. theme(panel.background = element_rect(fill = 'transparent'),
  12. axis.ticks.length = unit(0.4, "lines"),
  13. axis.ticks = element_line(color='black'),
  14. axis.line = element_line(size=.5, colour = "black"),
  15. axis.title = element_text(colour='black', size=12,face = "bold"),
  16. axis.text = element_text(colour='black',size=10,face = "bold"),
  17. text = element_text(size=8, color="black", family="serif"))

基于xgboost包的普通构建模型

  1. xgboost_model <- xgboost(data = as.matrix(train_data[, -1]),
  2. label = as.numeric(train_data$Private)-1,
  3. max_depth = 3,
  4. objective = "binary:logistic",
  5. nrounds = 10,
  6. verbose = FALSE,
  7. prediction = TRUE,
  8. eval_metric = "logloss")
  9. xgboost_model
  1. predict(xgboost_model,
  2. as.matrix(test_data[, -1])) %>%
  3. as.tibble() %>%
  4. mutate(prediction = round(value),
  5. label = as.numeric(test_data$Private)-1) %>%
  6. count(prediction, label)
  1. rocobj1 <- roc((as.numeric(test_data$Private) - 1), predict(xgboost_model, as.matrix(test_data[, -1]), type="prob"))
  2. auc1 <- round(auc((as.numeric(test_data$Private) - 1), predict(xgboost_model, as.matrix(test_data[, -1]), type="prob")), 4)
  3. ggroc(rocobj1, color = "red", linetype = 1, size = 1, alpha = 1, legacy.axes = T)+
  4. geom_abline(intercept = 0, slope = 1, color="grey", size = 1, linetype=1)+
  5. labs(x = "False Positive Rate (1 - Specificity)",
  6. y = "True Positive Rate (Sensivity or Recall)")+
  7. annotate("text",x = .75, y = .25,label=paste("AUC =", auc1),
  8. size = 5, family="serif")+
  9. coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))+
  10. theme_bw()+
  11. theme(panel.background = element_rect(fill = 'transparent'),
  12. axis.ticks.length = unit(0.4, "lines"),
  13. axis.ticks = element_line(color='black'),
  14. axis.line = element_line(size=.5, colour = "black"),
  15. axis.title = element_text(colour='black', size=12,face = "bold"),
  16. axis.text = element_text(colour='black',size=10,face = "bold"),
  17. text = element_text(size=8, color="black", family="serif"))

Machine learning: Boosting - 图1

基于xgboost包的调参建模 (一)

  1. dtrain <- xgb.DMatrix(as.matrix(train_data[, -1]),
  2. label = as.numeric(train_data$Private)-1)
  3. dtest <- xgb.DMatrix(as.matrix(test_data[, -1]),
  4. label = as.numeric(test_data$Private)-1)
  5. params <- list(max_depth = 3,
  6. objective = "binary:logistic",
  7. silent = 0)
  8. watchlist <- list(train = dtrain, eval = dtest)
  9. bst_model <- xgb.train(params = params,
  10. data = dtrain,
  11. nrounds = 10,
  12. watchlist = watchlist,
  13. verbose = FALSE,
  14. prediction = TRUE,
  15. eval_metric = "logloss")
  16. bst_model
  1. predict(bst_model,
  2. as.matrix(test_data[, -1])) %>%
  3. as_tibble() %>%
  4. mutate(prediction = round(value),
  5. label = as.numeric(test_data$Private)-1) %>%
  6. count(prediction, label)
  1. rocobj2 <- roc((as.numeric(test_data$Private) - 1), predict(bst_model, as.matrix(test_data[, -1]), type="prob"))
  2. auc2 <- round(auc((as.numeric(test_data$Private) - 1), predict(bst_model, as.matrix(test_data[, -1]), type="prob")), 4)
  3. ggroc(rocobj2, color = "red", linetype = 1, size = 1, alpha = 1, legacy.axes = T)+
  4. geom_abline(intercept = 0, slope = 1, color="grey", size = 1, linetype=1)+
  5. labs(x = "False Positive Rate (1 - Specificity)",
  6. y = "True Positive Rate (Sensivity or Recall)")+
  7. annotate("text",x = .75, y = .25,label=paste("AUC =", auc2),
  8. size = 5, family="serif")+
  9. coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))+
  10. theme_bw()+
  11. theme(panel.background = element_rect(fill = 'transparent'),
  12. axis.ticks.length = unit(0.4, "lines"),
  13. axis.ticks = element_line(color='black'),
  14. axis.line = element_line(size=.5, colour = "black"),
  15. axis.title = element_text(colour='black', size=12,face = "bold"),
  16. axis.text = element_text(colour='black',size=10,face = "bold"),
  17. text = element_text(size=8, color="black", family="serif"))

Machine learning: Boosting - 图2

基于xgboost包的调参建模 (二)

  1. cv_model <- xgb.cv(params = params,
  2. data = dtrain,
  3. nrounds = 100,
  4. watchlist = watchlist,
  5. nfold = 5,
  6. verbose = FALSE,
  7. prediction = TRUE,
  8. eval_metric = "logloss") # prediction of cv folds
  9. cv_model$evaluation_log %>%
  10. filter(test_logloss_mean == min(test_logloss_mean))
  11. min_logloss <- min(cv_model$evaluation_log[, test_logloss_mean])
  12. min_logloss_index <- which.min(cv_model$evaluation_log[, test_logloss_mean])
  13. nround <- min_logloss_index
  14. best_param <- params
  15. bst_model_cv <- xgb.train(data=dtrain, params=best_param, nrounds=nround, nthread=6)
  16. bst_model_cv
  1. rocobj3 <- roc((as.numeric(test_data$Private) - 1), predict(bst_model_cv, as.matrix(test_data[, -1]), type="prob"))
  2. auc3 <- round(auc((as.numeric(test_data$Private) - 1), predict(bst_model_cv, as.matrix(test_data[, -1]), type="prob")), 4)
  3. ggroc(rocobj3, color = "red", linetype = 1, size = 1, alpha = 1, legacy.axes = T)+
  4. geom_abline(intercept = 0, slope = 1, color="grey", size = 1, linetype=1)+
  5. labs(x = "False Positive Rate (1 - Specificity)",
  6. y = "True Positive Rate (Sensivity or Recall)")+
  7. annotate("text",x = .75, y = .25,label=paste("AUC =", auc3),
  8. size = 5, family="serif")+
  9. coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))+
  10. theme_bw()+
  11. theme(panel.background = element_rect(fill = 'transparent'),
  12. axis.ticks.length = unit(0.4, "lines"),
  13. axis.ticks = element_line(color='black'),
  14. axis.line = element_line(size=.5, colour = "black"),
  15. axis.title = element_text(colour='black', size=12,face = "bold"),
  16. axis.text = element_text(colour='black',size=10,face = "bold"),
  17. text = element_text(size=8, color="black", family="serif"))

Machine learning: Boosting - 图3

基于caret包的建模

  1. X_train <- xgb.DMatrix(as.matrix(train_data %>% select(-Private)))
  2. y_train <- train_data$Private
  3. X_test <- xgb.DMatrix(as.matrix(test_data %>% select(-Private)))
  4. y_test <- test_data$Private
  5. xgb_trcontrol <- trainControl(method = "cv",
  6. number = 5,
  7. allowParallel = TRUE,
  8. verboseIter = FALSE,
  9. returnData = FALSE)
  10. xgbGrid <- expand.grid(nrounds = c(100, 200),
  11. max_depth = c(10, 15, 20, 25),
  12. colsample_bytree = seq(0.5, 0.9, length.out = 5),
  13. eta = 0.1,
  14. gamma = 0,
  15. min_child_weight = 1,
  16. subsample = 1)
  17. set.seed(123)
  18. xgb_model_caret <- train(X_train, y_train,
  19. trControl = xgb_trcontrol,
  20. tuneGrid = xgbGrid,
  21. method = "xgbTree")
  22. xgb_model_caret$bestTune
  1. pred <- predict(xgb_model_caret, newdata=X_test)
  2. print(confusionMatrix(pred, y_test))
  1. rocobj4 <- roc(y_test, predict(xgb_model_caret, newdata = X_test, type = "prob")[, "Yes"])
  2. auc4 <- round(auc(y_test, predict(xgb_model_caret, newdata = X_test, type = "prob")[, "Yes"]), 4)
  3. ggroc(rocobj4, color = "red", linetype = 1, size = 1, alpha = 1, legacy.axes = T)+
  4. geom_abline(intercept = 0, slope = 1, color="grey", size = 1, linetype=1)+
  5. labs(x = "False Positive Rate (1 - Specificity)",
  6. y = "True Positive Rate (Sensivity or Recall)")+
  7. annotate("text",x = .75, y = .25,label=paste("AUC =", auc4),
  8. size = 5, family="serif")+
  9. coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))+
  10. theme_bw()+
  11. theme(panel.background = element_rect(fill = 'transparent'),
  12. axis.ticks.length = unit(0.4, "lines"),
  13. axis.ticks = element_line(color='black'),
  14. axis.line = element_line(size=.5, colour = "black"),
  15. axis.title = element_text(colour='black', size=12,face = "bold"),
  16. axis.text = element_text(colour='black',size=10,face = "bold"),
  17. text = element_text(size=8, color="black", family="serif"))

Machine learning: Boosting - 图4

Summary ROC

  1. rocobj1 <- roc((as.numeric(test_data$Private) - 1), predict(xgboost_model, as.matrix(test_data[, -1]), type="prob"))
  2. auc1 <- round(auc((as.numeric(test_data$Private) - 1), predict(xgboost_model, as.matrix(test_data[, -1]), type="prob")), 4)
  3. rocobj2 <- roc((as.numeric(test_data$Private) - 1), predict(bst_model, as.matrix(test_data[, -1]), type="prob"))
  4. auc2 <- round(auc((as.numeric(test_data$Private) - 1), predict(bst_model, as.matrix(test_data[, -1]), type="prob")), 4)
  5. rocobj3 <- roc((as.numeric(test_data$Private) - 1), predict(bst_model_cv, as.matrix(test_data[, -1]), type="prob"))
  6. auc3 <- round(auc((as.numeric(test_data$Private) - 1), predict(bst_model_cv, as.matrix(test_data[, -1]), type="prob")), 4)
  7. rocobj4 <- roc(y_test, predict(xgb_model_caret, newdata = X_test, type = "prob")[, "Yes"])
  8. auc4 <- round(auc(y_test, predict(xgb_model_caret, newdata = X_test, type = "prob")[, "Yes"]), 4)
  9. rocboj_list <- list(xgboost=rocobj1,
  10. best=rocobj2,
  11. best_cv=rocobj3,
  12. xgb_caret=rocobj4)
  13. ggroc(rocboj_list, linetype = 1, size = 1, alpha = 1, legacy.axes = T)+
  14. geom_abline(intercept = 0, slope = 1, color="grey", size = 1, linetype=1)+
  15. labs(x = "False Positive Rate (1 - Specificity)",
  16. y = "True Positive Rate (Sensivity or Recall)")+
  17. annotate("text",x = .75, y = .25,label=paste("AUC =", auc1, "(xgboost_model)"),
  18. size = 5, family="serif")+
  19. annotate("text",x = .75, y = .20,label=paste("AUC =", auc2, "(bst_model)"),
  20. size = 5, family="serif")+
  21. annotate("text",x = .75, y = .15,label=paste("AUC =", auc3, "(bst_model_cv)"),
  22. size = 5, family="serif")+
  23. annotate("text",x = .75, y = .10,label=paste("AUC =", auc4, "(xgb_model_caret)"),
  24. size = 5, family="serif")+
  25. coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))+
  26. scale_colour_manual(values = c("red", "blue", "black", "green"))+
  27. theme_bw()+
  28. theme(panel.background = element_rect(fill = 'transparent'),
  29. axis.ticks.length = unit(0.4, "lines"),
  30. axis.ticks = element_line(color='black'),
  31. axis.line = element_line(size=.5, colour = "black"),
  32. axis.title = element_text(colour='black', size=12,face = "bold"),
  33. axis.text = element_text(colour='black',size=10,face = "bold"),
  34. text = element_text(size=8, color="black", family="serif"))

Machine learning: Boosting - 图5

Information

  1. sessionInfo()
  1. R version 4.0.2 (2020-06-22)
  2. Platform: x86_64-w64-mingw32/x64 (64-bit)
  3. Running under: Windows 10 x64 (build 19042)
  4. Matrix products: default
  5. locale:
  6. [1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252
  7. [3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C
  8. [5] LC_TIME=English_United States.1252
  9. system code page: 936
  10. attached base packages:
  11. [1] stats graphics grDevices utils datasets methods base
  12. other attached packages:
  13. [1] xgboost_1.3.1.1 pROC_1.16.2 caret_6.0-86 lattice_0.20-41 ISLR_1.2 forcats_0.5.0
  14. [7] stringr_1.4.0 purrr_0.3.4 readr_1.4.0 tidyr_1.1.2 ggplot2_3.3.3 tidyverse_1.3.0
  15. [13] tibble_3.0.3 dplyr_1.0.2
  16. loaded via a namespace (and not attached):
  17. [1] httr_1.4.2 jsonlite_1.7.1 splines_4.0.2 foreach_1.5.1 prodlim_2019.11.13
  18. [6] modelr_0.1.8 assertthat_0.2.1 stats4_4.0.2 cellranger_1.1.0 yaml_2.2.1
  19. [11] ipred_0.9-9 pillar_1.4.6 backports_1.1.10 glue_1.4.2 digest_0.6.25
  20. [16] rvest_0.3.6 colorspace_1.4-1 recipes_0.1.15 gbm_2.1.8 htmltools_0.5.0
  21. [21] Matrix_1.3-2 plyr_1.8.6 timeDate_3043.102 pkgconfig_2.0.3 broom_0.7.3
  22. [26] haven_2.3.1 scales_1.1.1 gower_0.2.2 lava_1.6.8 farver_2.0.3
  23. [31] generics_0.1.0 ellipsis_0.3.1 withr_2.3.0 nnet_7.3-14 cli_2.1.0
  24. [36] survival_3.2-7 magrittr_1.5 crayon_1.3.4 readxl_1.3.1 evaluate_0.14
  25. [41] fs_1.5.0 fansi_0.4.1 nlme_3.1-150 MASS_7.3-53 xml2_1.3.2
  26. [46] class_7.3-17 rsconnect_0.8.16 tools_4.0.2 data.table_1.13.6 hms_0.5.3
  27. [51] lifecycle_0.2.0 munsell_0.5.0 reprex_0.3.0 compiler_4.0.2 e1071_1.7-4
  28. [56] rlang_0.4.7 grid_4.0.2 iterators_1.0.13 rstudioapi_0.11 labeling_0.4.2
  29. [61] rmarkdown_2.5 gtable_0.3.0 ModelMetrics_1.2.2.2 codetools_0.2-18 DBI_1.1.0
  30. [66] reshape2_1.4.4 R6_2.5.0 lubridate_1.7.9 knitr_1.30 utf8_1.1.4
  31. [71] stringi_1.5.3 Rcpp_1.0.5 vctrs_0.3.4 rpart_4.1-15 dbplyr_2.0.0
  32. [76] tidyselect_1.1.0 xfun_0.19

Reference

  1. extreme gradient boosting
  2. XGBoost R Tutorial