Screening many models simultaneously
library(tidymodels)
tidymodels_prefer()
data(concrete, package = 'modeldata')
glimpse(concrete)
Registered S3 method overwritten by 'tune':
  method                   from
  required_pkgs.model_spec parsnip
── Attaching packages ─────────────────────────────────── tidymodels 0.1.3 ──
✓ broom        0.7.9      ✓ recipes      0.1.16
✓ dials        0.0.9      ✓ rsample      0.1.0
✓ dplyr        1.0.7      ✓ tibble       3.1.3
✓ ggplot2     3.3.5      ✓ tidyr        1.1.3
✓ infer        1.0.0      ✓ tune         0.1.6
✓ modeldata    0.1.1      ✓ workflows    0.2.3
✓ parsnip      0.1.7      ✓ workflowsets 0.1.0
✓ purrr        0.3.4      ✓ yardstick    0.0.8
── Conflicts ─────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::discard() masks scales::discard()
✖ dplyr::filter()  masks stats::filter()
✖ dplyr::lag()     masks stats::lag()
✖ recipes::step()  masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.
Rows: 1,030
Columns: 9
$ cement <dbl> 540.0, 540.0, 332.5, 332.5, 198.6, 266.0, 380.0, 380.0, ~
$ blast_furnace_slag <dbl> 0.0, 0.0, 142.5, 142.5, 132.4, 114.0, 95.0, 95.0, 114.0,~
$ fly_ash <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
$ water <dbl> 162, 162, 228, 228, 192, 228, 228, 228, 228, 228, 192, 1~
$ superplasticizer <dbl> 2.5, 2.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0~
$ coarse_aggregate <dbl> 1040.0, 1055.0, 932.0, 932.0, 978.4, 932.0, 932.0, 932.0~
$ fine_aggregate <dbl> 676.0, 676.0, 594.0, 594.0, 825.5, 670.0, 594.0, 594.0, ~
$ age <int> 28, 28, 270, 365, 360, 90, 365, 28, 28, 28, 90, 28, 270,~
$ compressive_strength <dbl> 79.99, 61.89, 40.27, 41.05, 44.30, 47.03, 43.70, 36.45, ~
# Some mixtures were tested more than once; average the replicated outcomes
concrete <-
   concrete %>%
   group_by(cement, blast_furnace_slag, fly_ash, water, superplasticizer,
            coarse_aggregate, fine_aggregate, age) %>%
   summarize(compressive_strength = mean(compressive_strength),
             .groups = "drop")
nrow(concrete)
[1] 992
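After collapsing the replicates, every mixture should now appear exactly once. A quick sanity check (a sketch; the column list mirrors the group_by() call above):

concrete %>%
   count(cement, blast_furnace_slag, fly_ash, water, superplasticizer,
         coarse_aggregate, fine_aggregate, age) %>%
   filter(n > 1) %>%
   nrow()
# 0: no replicated mixtures remain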
Data splitting and cross-validation
set.seed(1501)
concrete_split <- initial_split(concrete, strata = compressive_strength)
concrete_train <- training(concrete_split)
concrete_test <- testing(concrete_split)
set.seed(1502)
concrete_folds <-
vfold_cv(concrete_train, strata = compressive_strength, repeats = 5)
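Note the size of this resampling scheme: 10-fold cross-validation repeated 5 times means each candidate model below is fit 50 times, which is why tuning takes so long. A quick check (a sketch):

nrow(concrete_folds)   # 10 folds x 5 repeats = 50 resamples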
Simple preprocessing
normalized_rec <-
recipe(compressive_strength ~ ., data = concrete_train) %>%
step_normalize(all_predictors())
poly_recipe <-
normalized_rec %>%
step_poly(all_predictors()) %>%
step_interact(~ all_predictors():all_predictors())
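The quadratic/interaction recipe expands the 8 predictors considerably. To see how wide the design matrix becomes, the recipe can be prepped and baked on the training data (a sketch; the exact column count depends on the split):

poly_recipe %>%
   prep() %>%
   bake(new_data = NULL) %>%   # returns the processed training set
   ncol()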
Build multiple models
library(rules)     # for cubist_rules()
library(baguette)  # for bag_tree()
linear_reg_spec <-
linear_reg(penalty = tune(), mixture = tune()) %>%
set_engine("glmnet")
nnet_spec <-
mlp(hidden_units = tune(), penalty = tune(), epochs = tune()) %>%
set_engine("nnet", MaxNWts = 2600) %>%
set_mode("regression")
mars_spec <-
mars(prod_degree = tune()) %>% #<- use GCV to choose terms
set_engine("earth") %>%
set_mode("regression")
svm_r_spec <-
svm_rbf(cost = tune(), rbf_sigma = tune()) %>%
set_engine("kernlab") %>%
set_mode("regression")
svm_p_spec <-
svm_poly(cost = tune(), degree = tune()) %>%
set_engine("kernlab") %>%
set_mode("regression")
knn_spec <-
nearest_neighbor(neighbors = tune(), dist_power = tune(), weight_func = tune()) %>%
set_engine("kknn") %>%
set_mode("regression")
cart_spec <-
decision_tree(cost_complexity = tune(), min_n = tune()) %>%
set_engine("rpart") %>%
set_mode("regression")
bag_cart_spec <-
bag_tree() %>%
set_engine("rpart", times = 50L) %>%
set_mode("regression")
rf_spec <-
rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
set_engine("ranger") %>%
set_mode("regression")
xgb_spec <-
boost_tree(tree_depth = tune(), learn_rate = tune(), loss_reduction = tune(),
min_n = tune(), sample_size = tune(), trees = tune()) %>%
set_engine("xgboost") %>%
set_mode("regression")
cubist_spec <-
cubist_rules(committees = tune(), neighbors = tune()) %>%
set_engine("Cubist")
nnet_param <-
   nnet_spec %>%
   parameters() %>%
   # widen the default range; MaxNWts = 2600 above allows up to 27 hidden units
   update(hidden_units = hidden_units(c(1, 27)))
Different preprocessing steps
normalized <-
workflow_set(
preproc = list(normalized = normalized_rec),
models = list(SVM_radial = svm_r_spec, SVM_poly = svm_p_spec,
KNN = knn_spec, neural_network = nnet_spec)
)
normalized
Pick one at random to take a look:
normalized %>% pull_workflow(id = "normalized_KNN")  # extract_workflow() in newer workflowsets
══ Workflow ═══════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: nearest_neighbor()
── Preprocessor ─────────────────────────────────────────────────────────────
1 Recipe Step
• step_normalize()
── Model ────────────────────────────────────────────────────────────────────
K-Nearest Neighbor Model Specification (regression)
Main Arguments:
neighbors = tune()
weight_func = tune()
dist_power = tune()
Computational engine: kknn
normalized <-
   normalized %>%
   # the option name must be param_info; otherwise tune_grid() ignores it
   option_add(param_info = nnet_param, id = "normalized_neural_network")
normalized
model_vars <-
workflow_variables(outcomes = compressive_strength,
predictors = everything())
no_pre_proc <-
workflow_set(
preproc = list(simple = model_vars),
models = list(MARS = mars_spec, CART = cart_spec, CART_bagged = bag_cart_spec,
RF = rf_spec, boosting = xgb_spec, Cubist = cubist_spec)
)
no_pre_proc
with_features <-
workflow_set(
preproc = list(full_quad = poly_recipe),
models = list(linear_reg = linear_reg_spec, KNN = knn_spec)
)
all_workflows <-
bind_rows(no_pre_proc, normalized, with_features) %>%
# Make the workflow IDs a little simpler:
mutate(wflow_id = gsub("(simple_)|(normalized_)", "", wflow_id))
all_workflows
Now we can train all the models.
This step takes a very long time. For reference, my machine is an AMD 5900X with 2 × 32 GB of Fury 3600 memory.
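Since the control object below sets parallel_over = "everything", registering a parallel backend first lets workflow_map() spread the work across cores. A minimal sketch using doParallel (assuming that package is installed):

library(doParallel)
cl <- makePSOCKcluster(parallel::detectCores() - 1)   # leave one core free
registerDoParallel(cl)
# ... run workflow_map() below ...
# stopCluster(cl)   # shut the workers down when finished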
grid_ctrl <-
control_grid(
save_pred = TRUE,
parallel_over = "everything",
save_workflow = TRUE
)
grid_results <-
all_workflows %>%
workflow_map(
seed = 1503,
resamples = concrete_folds,
grid = 25,
control = grid_ctrl
)
i  1 of 12 tuning:     MARS
✓  1 of 12 tuning:     MARS (11.2s)
i  2 of 12 tuning:     CART
✓  2 of 12 tuning:     CART (1m 28.1s)
i  No tuning parameters. `fit_resamples()` will be attempted
i  3 of 12 resampling: CART_bagged
✓  3 of 12 resampling: CART_bagged (2m 22.4s)
i  4 of 12 tuning:     RF
i  Creating pre-processing data to finalize unknown parameter: mtry
✓  4 of 12 tuning:     RF (3m 7.6s)
i  5 of 12 tuning:     boosting
✓  5 of 12 tuning:     boosting (4m 50.9s)
i  6 of 12 tuning:     Cubist
✓  6 of 12 tuning:     Cubist (5m 6.5s)
i  7 of 12 tuning:     SVM_radial
✓  7 of 12 tuning:     SVM_radial (2m 4.4s)
i  8 of 12 tuning:     SVM_poly
✓  8 of 12 tuning:     SVM_poly (14m 4.8s)
i  9 of 12 tuning:     KNN
✓  9 of 12 tuning:     KNN (2m 57.5s)
i 10 of 12 tuning:     neural_network
✓ 10 of 12 tuning:     neural_network (2m 48.1s)
i 11 of 12 tuning:     full_quad_linear_reg
✓ 11 of 12 tuning:     full_quad_linear_reg (2m 40.8s)
i 12 of 12 tuning:     full_quad_KNN
✓ 12 of 12 tuning:     full_quad_KNN (23m 7.7s)
Results
grid_results
Rank the results by a chosen metric (RMSE):
grid_results %>%
rank_results() %>%
filter(.metric == "rmse") %>%
select(model, .config, rmse = mean, rank)
Visualize the results
autoplot(
grid_results,
rank_metric = "rmse", # <- how to order models
metric = "rmse", # <- which metric to visualize
select_best = TRUE # <- one point per workflow
)
autoplot(grid_results, id = "Cubist", metric = "rmse")
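Beyond plots, the tuning results of any single workflow can be pulled out of the set and inspected numerically. A sketch for the Cubist entry (using the wflow_id assigned above):

grid_results %>%
   extract_workflow_set_result("Cubist") %>%
   show_best(metric = "rmse")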
A faster way to screen many models
library(finetune)
race_ctrl <-
control_race(
save_pred = TRUE,
parallel_over = "everything",
save_workflow = TRUE
)
race_results <-
all_workflows %>%
workflow_map(
"tune_race_anova", # 这个方法更快
seed = 1503,
resamples = concrete_folds,
grid = 25,
control = race_ctrl
)
Visualize the results:
autoplot(
race_results,
rank_metric = "rmse",
metric = "rmse",
select_best = TRUE
)
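Racing discards clearly inferior candidates early, so far fewer model configurations are fully resampled. One way to quantify the savings is to count the rows each approach produced, since collect_metrics() with summarize = FALSE returns one row per configuration per resample (a sketch):

num_grid_models <- nrow(collect_metrics(grid_results, summarize = FALSE))
num_race_models <- nrow(collect_metrics(race_results, summarize = FALSE))
num_race_models / num_grid_models   # fraction of the full-grid workload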
Compare the two approaches to see how much the results differ.
matched_results <-
rank_results(race_results, select_best = TRUE) %>%
select(wflow_id, .metric, race = mean, config_race = .config) %>%
inner_join(
rank_results(grid_results, select_best = TRUE) %>%
select(wflow_id, .metric, complete = mean,
config_complete = .config, model),
by = c("wflow_id", ".metric"),
) %>%
filter(.metric == "rmse")
matched_results %>%
ggplot(aes(x = complete, y = race)) +
geom_abline(lty = 3) +
geom_point(aes(col = model)) +
coord_obs_pred() +
labs(x = "Complete Grid RMSE", y = "Racing RMSE")
Selecting the final model
best_results <-
race_results %>%
extract_workflow_set_result("boosting") %>%
select_best(metric = "rmse")
best_results
boosting_test_results <-
race_results %>%
extract_workflow("boosting") %>%
finalize_workflow(best_results) %>%
last_fit(split = concrete_split)
collect_metrics(boosting_test_results)
boosting_test_results %>%
collect_predictions() %>%
ggplot(aes(x = compressive_strength, y = .pred)) +
geom_abline(col = "green", lty = 2) +
geom_point(alpha = 0.5) +
coord_obs_pred() +
labs(x = "observed", y = "predicted")