更多知识分享请到 https://zouhua.top/。
knitr::opts_chunk$set(warning = F, message = F)library(dplyr)library(tibble)library(tidyverse)library(ISLR)library(caret)library(pROC)library(xgboost)
Gradient Boosting in caret
| Model | method Value | Type | Libraries | Tuning Parameters |
|---|---|---|---|---|
| eXtreme Gradient Boosting | xgbDART | Classification, Regression | xgboost, plyr | nrounds, max_depth, eta, gamma, subsample, colsample_bytree, rate_drop, skip_drop, min_child_weight |
| eXtreme Gradient Boosting | xgbLinear | Classification, Regression | xgboost | nrounds, lambda, alpha, eta |
| eXtreme Gradient Boosting | xgbTree | Classification, Regression | xgboost, plyr | nrounds, max_depth, eta, gamma, colsample_bytree, min_child_weight, subsample |
| Gradient Boosting Machines | gbm_h2o | Classification, Regression | h2o | ntrees, max_depth, min_rows, learn_rate, col_sample_rate |
| Stochastic Gradient Boosting | gbm | Classification, Regression | gbm, plyr | n.trees, interaction.depth, shrinkage, n.minobsinnode |
load data
ml_data <- Collegeml_data %>%glimpse()set.seed(123)index <- createDataPartition(ml_data$Private, p = 0.7, list = FALSE)train_data <- ml_data[index, ]test_data <- ml_data[-index, ]
Stochastic Gradient Boosting
# Train model with preprocessing & repeated cvmodel_gbm <- caret::train(Private ~ .,data = train_data,method = "gbm",preProcess = c("scale", "center"),trControl = trainControl(method = "repeatedcv",number = 5,repeats = 3,verboseIter = FALSE),verbose = 0)model_gbm
caret::confusionMatrix(data = predict(model_gbm, test_data),reference = test_data$Private)predict(model_gbm, test_data, type = "raw")predict(model_gbm, test_data, type = "prob")
- ROC
rocobj0 <- roc(test_data$Private, predict(model_gbm, newdata = test_data, type = "prob")[, "No"])auc0 <- round(auc(test_data$Private, predict(model_gbm, newdata = test_data, type = "prob")[, "Yes"]),4)ggroc(rocobj0, color = "red", linetype = 1, size = 1, alpha = 1, legacy.axes = T)+geom_abline(intercept = 0, slope = 1, color="grey", size = 1, linetype=1)+labs(x = "False Positive Rate (1 - Specificity)",y = "True Positive Rate (Sensivity or Recall)")+annotate("text",x = .75, y = .25,label=paste("AUC =", auc0),size = 5, family="serif")+coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))+theme_bw()+theme(panel.background = element_rect(fill = 'transparent'),axis.ticks.length = unit(0.4, "lines"),axis.ticks = element_line(color='black'),axis.line = element_line(size=.5, colour = "black"),axis.title = element_text(colour='black', size=12,face = "bold"),axis.text = element_text(colour='black',size=10,face = "bold"),text = element_text(size=8, color="black", family="serif"))
基于xgboost包的普通构建模型
xgboost_model <- xgboost(data = as.matrix(train_data[, -1]),label = as.numeric(train_data$Private)-1,max_depth = 3,objective = "binary:logistic",nrounds = 10,verbose = FALSE,prediction = TRUE,eval_metric = "logloss")xgboost_model
predict(xgboost_model,as.matrix(test_data[, -1])) %>%as.tibble() %>%mutate(prediction = round(value),label = as.numeric(test_data$Private)-1) %>%count(prediction, label)
rocobj1 <- roc((as.numeric(test_data$Private) - 1), predict(xgboost_model, as.matrix(test_data[, -1]), type="prob"))auc1 <- round(auc((as.numeric(test_data$Private) - 1), predict(xgboost_model, as.matrix(test_data[, -1]), type="prob")), 4)ggroc(rocobj1, color = "red", linetype = 1, size = 1, alpha = 1, legacy.axes = T)+geom_abline(intercept = 0, slope = 1, color="grey", size = 1, linetype=1)+labs(x = "False Positive Rate (1 - Specificity)",y = "True Positive Rate (Sensivity or Recall)")+annotate("text",x = .75, y = .25,label=paste("AUC =", auc1),size = 5, family="serif")+coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))+theme_bw()+theme(panel.background = element_rect(fill = 'transparent'),axis.ticks.length = unit(0.4, "lines"),axis.ticks = element_line(color='black'),axis.line = element_line(size=.5, colour = "black"),axis.title = element_text(colour='black', size=12,face = "bold"),axis.text = element_text(colour='black',size=10,face = "bold"),text = element_text(size=8, color="black", family="serif"))

基于xgboost包的调参建模 (一)
dtrain <- xgb.DMatrix(as.matrix(train_data[, -1]),label = as.numeric(train_data$Private)-1)dtest <- xgb.DMatrix(as.matrix(test_data[, -1]),label = as.numeric(test_data$Private)-1)params <- list(max_depth = 3,objective = "binary:logistic",silent = 0)watchlist <- list(train = dtrain, eval = dtest)bst_model <- xgb.train(params = params,data = dtrain,nrounds = 10,watchlist = watchlist,verbose = FALSE,prediction = TRUE,eval_metric = "logloss")bst_model
predict(bst_model,as.matrix(test_data[, -1])) %>%as_tibble() %>%mutate(prediction = round(value),label = as.numeric(test_data$Private)-1) %>%count(prediction, label)
rocobj2 <- roc((as.numeric(test_data$Private) - 1), predict(bst_model, as.matrix(test_data[, -1]), type="prob"))auc2 <- round(auc((as.numeric(test_data$Private) - 1), predict(bst_model, as.matrix(test_data[, -1]), type="prob")), 4)ggroc(rocobj2, color = "red", linetype = 1, size = 1, alpha = 1, legacy.axes = T)+geom_abline(intercept = 0, slope = 1, color="grey", size = 1, linetype=1)+labs(x = "False Positive Rate (1 - Specificity)",y = "True Positive Rate (Sensivity or Recall)")+annotate("text",x = .75, y = .25,label=paste("AUC =", auc2),size = 5, family="serif")+coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))+theme_bw()+theme(panel.background = element_rect(fill = 'transparent'),axis.ticks.length = unit(0.4, "lines"),axis.ticks = element_line(color='black'),axis.line = element_line(size=.5, colour = "black"),axis.title = element_text(colour='black', size=12,face = "bold"),axis.text = element_text(colour='black',size=10,face = "bold"),text = element_text(size=8, color="black", family="serif"))

基于xgboost包的调参建模 (二)
cv_model <- xgb.cv(params = params,data = dtrain,nrounds = 100,watchlist = watchlist,nfold = 5,verbose = FALSE,prediction = TRUE,eval_metric = "logloss") # prediction of cv foldscv_model$evaluation_log %>%filter(test_logloss_mean == min(test_logloss_mean))min_logloss <- min(cv_model$evaluation_log[, test_logloss_mean])min_logloss_index <- which.min(cv_model$evaluation_log[, test_logloss_mean])nround <- min_logloss_indexbest_param <- paramsbst_model_cv <- xgb.train(data=dtrain, params=best_param, nrounds=nround, nthread=6)bst_model_cv
rocobj3 <- roc((as.numeric(test_data$Private) - 1), predict(bst_model_cv, as.matrix(test_data[, -1]), type="prob"))auc3 <- round(auc((as.numeric(test_data$Private) - 1), predict(bst_model_cv, as.matrix(test_data[, -1]), type="prob")), 4)ggroc(rocobj3, color = "red", linetype = 1, size = 1, alpha = 1, legacy.axes = T)+geom_abline(intercept = 0, slope = 1, color="grey", size = 1, linetype=1)+labs(x = "False Positive Rate (1 - Specificity)",y = "True Positive Rate (Sensivity or Recall)")+annotate("text",x = .75, y = .25,label=paste("AUC =", auc3),size = 5, family="serif")+coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))+theme_bw()+theme(panel.background = element_rect(fill = 'transparent'),axis.ticks.length = unit(0.4, "lines"),axis.ticks = element_line(color='black'),axis.line = element_line(size=.5, colour = "black"),axis.title = element_text(colour='black', size=12,face = "bold"),axis.text = element_text(colour='black',size=10,face = "bold"),text = element_text(size=8, color="black", family="serif"))

基于caret包的建模
X_train <- xgb.DMatrix(as.matrix(train_data %>% select(-Private)))y_train <- train_data$PrivateX_test <- xgb.DMatrix(as.matrix(test_data %>% select(-Private)))y_test <- test_data$Privatexgb_trcontrol <- trainControl(method = "cv",number = 5,allowParallel = TRUE,verboseIter = FALSE,returnData = FALSE)xgbGrid <- expand.grid(nrounds = c(100, 200),max_depth = c(10, 15, 20, 25),colsample_bytree = seq(0.5, 0.9, length.out = 5),eta = 0.1,gamma = 0,min_child_weight = 1,subsample = 1)set.seed(123)xgb_model_caret <- train(X_train, y_train,trControl = xgb_trcontrol,tuneGrid = xgbGrid,method = "xgbTree")xgb_model_caret$bestTune
pred <- predict(xgb_model_caret, newdata=X_test)print(confusionMatrix(pred, y_test))
rocobj4 <- roc(y_test, predict(xgb_model_caret, newdata = X_test, type = "prob")[, "Yes"])auc4 <- round(auc(y_test, predict(xgb_model_caret, newdata = X_test, type = "prob")[, "Yes"]), 4)ggroc(rocobj4, color = "red", linetype = 1, size = 1, alpha = 1, legacy.axes = T)+geom_abline(intercept = 0, slope = 1, color="grey", size = 1, linetype=1)+labs(x = "False Positive Rate (1 - Specificity)",y = "True Positive Rate (Sensivity or Recall)")+annotate("text",x = .75, y = .25,label=paste("AUC =", auc4),size = 5, family="serif")+coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))+theme_bw()+theme(panel.background = element_rect(fill = 'transparent'),axis.ticks.length = unit(0.4, "lines"),axis.ticks = element_line(color='black'),axis.line = element_line(size=.5, colour = "black"),axis.title = element_text(colour='black', size=12,face = "bold"),axis.text = element_text(colour='black',size=10,face = "bold"),text = element_text(size=8, color="black", family="serif"))

Summary ROC
rocobj1 <- roc((as.numeric(test_data$Private) - 1), predict(xgboost_model, as.matrix(test_data[, -1]), type="prob"))auc1 <- round(auc((as.numeric(test_data$Private) - 1), predict(xgboost_model, as.matrix(test_data[, -1]), type="prob")), 4)rocobj2 <- roc((as.numeric(test_data$Private) - 1), predict(bst_model, as.matrix(test_data[, -1]), type="prob"))auc2 <- round(auc((as.numeric(test_data$Private) - 1), predict(bst_model, as.matrix(test_data[, -1]), type="prob")), 4)rocobj3 <- roc((as.numeric(test_data$Private) - 1), predict(bst_model_cv, as.matrix(test_data[, -1]), type="prob"))auc3 <- round(auc((as.numeric(test_data$Private) - 1), predict(bst_model_cv, as.matrix(test_data[, -1]), type="prob")), 4)rocobj4 <- roc(y_test, predict(xgb_model_caret, newdata = X_test, type = "prob")[, "Yes"])auc4 <- round(auc(y_test, predict(xgb_model_caret, newdata = X_test, type = "prob")[, "Yes"]), 4)rocboj_list <- list(xgboost=rocobj1,best=rocobj2,best_cv=rocobj3,xgb_caret=rocobj4)ggroc(rocboj_list, linetype = 1, size = 1, alpha = 1, legacy.axes = T)+geom_abline(intercept = 0, slope = 1, color="grey", size = 1, linetype=1)+labs(x = "False Positive Rate (1 - Specificity)",y = "True Positive Rate (Sensivity or Recall)")+annotate("text",x = .75, y = .25,label=paste("AUC =", auc1, "(xgboost_model)"),size = 5, family="serif")+annotate("text",x = .75, y = .20,label=paste("AUC =", auc2, "(bst_model)"),size = 5, family="serif")+annotate("text",x = .75, y = .15,label=paste("AUC =", auc3, "(bst_model_cv)"),size = 5, family="serif")+annotate("text",x = .75, y = .10,label=paste("AUC =", auc4, "(xgb_model_caret)"),size = 5, family="serif")+coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))+scale_colour_manual(values = c("red", "blue", "black", "green"))+theme_bw()+theme(panel.background = element_rect(fill = 'transparent'),axis.ticks.length = unit(0.4, "lines"),axis.ticks = element_line(color='black'),axis.line = element_line(size=.5, colour = "black"),axis.title = element_text(colour='black', size=12,face = "bold"),axis.text = element_text(colour='black',size=10,face = "bold"),text = element_text(size=8, color="black", family="serif"))

Information
sessionInfo()
R version 4.0.2 (2020-06-22)Platform: x86_64-w64-mingw32/x64 (64-bit)Running under: Windows 10 x64 (build 19042)Matrix products: defaultlocale:[1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C[5] LC_TIME=English_United States.1252system code page: 936attached base packages:[1] stats graphics grDevices utils datasets methods baseother attached packages:[1] xgboost_1.3.1.1 pROC_1.16.2 caret_6.0-86 lattice_0.20-41 ISLR_1.2 forcats_0.5.0[7] stringr_1.4.0 purrr_0.3.4 readr_1.4.0 tidyr_1.1.2 ggplot2_3.3.3 tidyverse_1.3.0[13] tibble_3.0.3 dplyr_1.0.2loaded via a namespace (and not attached):[1] httr_1.4.2 jsonlite_1.7.1 splines_4.0.2 foreach_1.5.1 prodlim_2019.11.13[6] modelr_0.1.8 assertthat_0.2.1 stats4_4.0.2 cellranger_1.1.0 yaml_2.2.1[11] ipred_0.9-9 pillar_1.4.6 backports_1.1.10 glue_1.4.2 digest_0.6.25[16] rvest_0.3.6 colorspace_1.4-1 recipes_0.1.15 gbm_2.1.8 htmltools_0.5.0[21] Matrix_1.3-2 plyr_1.8.6 timeDate_3043.102 pkgconfig_2.0.3 broom_0.7.3[26] haven_2.3.1 scales_1.1.1 gower_0.2.2 lava_1.6.8 farver_2.0.3[31] generics_0.1.0 ellipsis_0.3.1 withr_2.3.0 nnet_7.3-14 cli_2.1.0[36] survival_3.2-7 magrittr_1.5 crayon_1.3.4 readxl_1.3.1 evaluate_0.14[41] fs_1.5.0 fansi_0.4.1 nlme_3.1-150 MASS_7.3-53 xml2_1.3.2[46] class_7.3-17 rsconnect_0.8.16 tools_4.0.2 data.table_1.13.6 hms_0.5.3[51] lifecycle_0.2.0 munsell_0.5.0 reprex_0.3.0 compiler_4.0.2 e1071_1.7-4[56] rlang_0.4.7 grid_4.0.2 iterators_1.0.13 rstudioapi_0.11 labeling_0.4.2[61] rmarkdown_2.5 gtable_0.3.0 ModelMetrics_1.2.2.2 codetools_0.2-18 DBI_1.1.0[66] reshape2_1.4.4 R6_2.5.0 lubridate_1.7.9 knitr_1.30 utf8_1.1.4[71] stringi_1.5.3 Rcpp_1.0.5 vctrs_0.3.4 rpart_4.1-15 dbplyr_2.0.0[76] tidyselect_1.1.0 xfun_0.19
