同时筛选多个模型

  1. library(tidymodels)
  2. tidymodels_prefer()
  3. data(concrete, package = 'modeldata')
  4. glimpse(concrete)
  1. Registered S3 method overwritten by 'tune':
  2. method from
  3. required_pkgs.model_spec parsnip
  4. -- Attaching packages --------------------------------------------- tidymodels 0.1.3 --
  5. broom 0.7.9 recipes 0.1.16
  6. dials 0.0.9 rsample 0.1.0
  7. dplyr 1.0.7 tibble 3.1.3
  8. ggplot2 3.3.5 tidyr 1.1.3
  9. infer 1.0.0 tune 0.1.6
  10. modeldata 0.1.1 workflows 0.2.3
  11. parsnip 0.1.7 workflowsets 0.1.0
  12. purrr 0.3.4 yardstick 0.0.8
  13. -- Conflicts ------------------------------------------------ tidymodels_conflicts() --
  14. x purrr::discard() masks scales::discard()
  15. x dplyr::filter() masks stats::filter()
  16. x dplyr::lag() masks stats::lag()
  17. x recipes::step() masks stats::step()
  18. * Use tidymodels_prefer() to resolve common conflicts.
  19. Rows: 1,030
  20. Columns: 9
  21. $ cement <dbl> 540.0, 540.0, 332.5, 332.5, 198.6, 266.0, 380.0, 380.0, ~
  22. $ blast_furnace_slag <dbl> 0.0, 0.0, 142.5, 142.5, 132.4, 114.0, 95.0, 95.0, 114.0,~
  23. $ fly_ash <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
  24. $ water <dbl> 162, 162, 228, 228, 192, 228, 228, 228, 228, 228, 192, 1~
  25. $ superplasticizer <dbl> 2.5, 2.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0~
  26. $ coarse_aggregate <dbl> 1040.0, 1055.0, 932.0, 932.0, 978.4, 932.0, 932.0, 932.0~
  27. $ fine_aggregate <dbl> 676.0, 676.0, 594.0, 594.0, 825.5, 670.0, 594.0, 594.0, ~
  28. $ age <int> 28, 28, 270, 365, 360, 90, 365, 28, 28, 28, 90, 28, 270,~
  29. $ compressive_strength <dbl> 79.99, 61.89, 40.27, 41.05, 44.30, 47.03, 43.70, 36.45, ~
  1. concrete <-
  2. concrete %>%
  3. group_by(cement, blast_furnace_slag, fly_ash, water, superplasticizer,
  4. coarse_aggregate, fine_aggregate, age) %>%
  5. summarize(compressive_strength = mean(compressive_strength),
  6. .groups = "drop")
  7. nrow(concrete)
  1. [1] 992

数据分割、交叉验证

  1. set.seed(1501)
  2. concrete_split <- initial_split(concrete, strata = compressive_strength)
  3. concrete_train <- training(concrete_split)
  4. concrete_test <- testing(concrete_split)
  5. set.seed(1502)
  6. concrete_folds <-
  7. vfold_cv(concrete_train, strata = compressive_strength, repeats = 5)

简单的预处理

  1. normalized_rec <-
  2. recipe(compressive_strength ~ ., data = concrete_train) %>%
  3. step_normalize(all_predictors())
  4. poly_recipe <-
  5. normalized_rec %>%
  6. step_poly(all_predictors()) %>%
  7. step_interact(~ all_predictors():all_predictors())

建立多个模型

  1. library(rules)
  2. library(baguette)
  3. linear_reg_spec <-
  4. linear_reg(penalty = tune(), mixture = tune()) %>%
  5. set_engine("glmnet")
  6. nnet_spec <-
  7. mlp(hidden_units = tune(), penalty = tune(), epochs = tune()) %>%
  8. set_engine("nnet", MaxNWts = 2600) %>%
  9. set_mode("regression")
  10. mars_spec <-
  11. mars(prod_degree = tune()) %>% #<- use GCV to choose terms
  12. set_engine("earth") %>%
  13. set_mode("regression")
  14. svm_r_spec <-
  15. svm_rbf(cost = tune(), rbf_sigma = tune()) %>%
  16. set_engine("kernlab") %>%
  17. set_mode("regression")
  18. svm_p_spec <-
  19. svm_poly(cost = tune(), degree = tune()) %>%
  20. set_engine("kernlab") %>%
  21. set_mode("regression")
  22. knn_spec <-
  23. nearest_neighbor(neighbors = tune(), dist_power = tune(), weight_func = tune()) %>%
  24. set_engine("kknn") %>%
  25. set_mode("regression")
  26. cart_spec <-
  27. decision_tree(cost_complexity = tune(), min_n = tune()) %>%
  28. set_engine("rpart") %>%
  29. set_mode("regression")
  30. bag_cart_spec <-
  31. bag_tree() %>%
  32. set_engine("rpart", times = 50L) %>%
  33. set_mode("regression")
  34. rf_spec <-
  35. rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
  36. set_engine("ranger") %>%
  37. set_mode("regression")
  38. xgb_spec <-
  39. boost_tree(tree_depth = tune(), learn_rate = tune(), loss_reduction = tune(),
  40. min_n = tune(), sample_size = tune(), trees = tune()) %>%
  41. set_engine("xgboost") %>%
  42. set_mode("regression")
  43. cubist_spec <-
  44. cubist_rules(committees = tune(), neighbors = tune()) %>%
  45. set_engine("Cubist")
  1. nnet_param <-
  2. nnet_spec %>%
  3. parameters() %>% # 注：新版 tidymodels 中 parameters() 已弃用，请改用 extract_parameter_set_dials()
  4. update(hidden_units = hidden_units(c(1, 27)))

不同的预处理步骤

  1. normalized <-
  2. workflow_set(
  3. preproc = list(normalized = normalized_rec),
  4. models = list(SVM_radial = svm_r_spec, SVM_poly = svm_p_spec,
  5. KNN = knn_spec, neural_network = nnet_spec)
  6. )
  7. normalized

tidymodels-exercise-09 - 图1

随便挑选一个看看

  1. normalized %>% extract_workflow(id = "normalized_KNN")
  1. == Workflow ===========================================================================
  2. Preprocessor: Recipe
  3. Model: nearest_neighbor()
  4. -- Preprocessor -----------------------------------------------------------------------
  5. 1 Recipe Step
  6. * step_normalize()
  7. -- Model ------------------------------------------------------------------------------
  8. K-Nearest Neighbor Model Specification (regression)
  9. Main Arguments:
  10. neighbors = tune()
  11. weight_func = tune()
  12. dist_power = tune()
  13. Computational engine: kknn
  1. normalized <-
  2. normalized %>%
  3. option_add(param_info = nnet_param, id = "normalized_neural_network")
  4. normalized

tidymodels-exercise-09 - 图2

  1. model_vars <-
  2. workflow_variables(outcomes = compressive_strength,
  3. predictors = everything())
  4. no_pre_proc <-
  5. workflow_set(
  6. preproc = list(simple = model_vars),
  7. models = list(MARS = mars_spec, CART = cart_spec, CART_bagged = bag_cart_spec,
  8. RF = rf_spec, boosting = xgb_spec, Cubist = cubist_spec)
  9. )
  10. no_pre_proc

tidymodels-exercise-09 - 图3

  1. with_features <-
  2. workflow_set(
  3. preproc = list(full_quad = poly_recipe),
  4. models = list(linear_reg = linear_reg_spec, KNN = knn_spec)
  5. )
  1. all_workflows <-
  2. bind_rows(no_pre_proc, normalized, with_features) %>%
  3. # Make the workflow ID's a little more simple:
  4. mutate(wflow_id = gsub("(simple_)|(normalized_)", "", wflow_id))
  5. all_workflows

tidymodels-exercise-09 - 图4

下面开始训练模型

这一步非常耗时，给大家一个参考：我的配置是 AMD 5900X，内存是 Fury 3600 32G × 2

  1. grid_ctrl <-
  2. control_grid(
  3. save_pred = TRUE,
  4. parallel_over = "everything",
  5. save_workflow = TRUE
  6. )
  7. grid_results <-
  8. all_workflows %>%
  9. workflow_map(
  10. seed = 1503,
  11. resamples = concrete_folds,
  12. grid = 25,
  13. control = grid_ctrl
  14. )
  1. i 1 of 12 tuning: MARS
  2. 1 of 12 tuning: MARS (11.2s)
  3. i 2 of 12 tuning: CART
  4. 2 of 12 tuning: CART (1m 28.1s)
  5. i No tuning parameters. `fit_resamples()` will be attempted
  6. i 3 of 12 resampling: CART_bagged
  7. 3 of 12 resampling: CART_bagged (2m 22.4s)
  8. i 4 of 12 tuning: RF
  9. i Creating pre-processing data to finalize unknown parameter: mtry
  10. 4 of 12 tuning: RF (3m 7.6s)
  11. i 5 of 12 tuning: boosting
  12. 5 of 12 tuning: boosting (4m 50.9s)
  13. i 6 of 12 tuning: Cubist
  14. 6 of 12 tuning: Cubist (5m 6.5s)
  15. i 7 of 12 tuning: SVM_radial
  16. 7 of 12 tuning: SVM_radial (2m 4.4s)
  17. i 8 of 12 tuning: SVM_poly
  18. 8 of 12 tuning: SVM_poly (14m 4.8s)
  19. i 9 of 12 tuning: KNN
  20. 9 of 12 tuning: KNN (2m 57.5s)
  21. i 10 of 12 tuning: neural_network
  22. Warning: The `...` are not used in this function but one or more objects were passed: 'param'
（注：此警告是因为 option_add() 中的选项名应为 param_info 而不是 param；写成 param 会被忽略，导致前面对 nnet_param 设置的 hidden_units 范围未生效。）
  23. 10 of 12 tuning: neural_network (2m 48.1s)
  24. i 11 of 12 tuning: full_quad_linear_reg
  25. 11 of 12 tuning: full_quad_linear_reg (2m 40.8s)
  26. i 12 of 12 tuning: full_quad_KNN
  27. 12 of 12 tuning: full_quad_KNN (23m 7.7s)

结果

  1. grid_results

tidymodels-exercise-09 - 图5

按照某一标准(RMSE)排列结果:

  1. grid_results %>%
  2. rank_results() %>%
  3. filter(.metric == "rmse") %>%
  4. select(model, .config, rmse = mean, rank)

tidymodels-exercise-09 - 图6

可视化结果

  1. autoplot(
  2. grid_results,
  3. rank_metric = "rmse", # <- how to order models
  4. metric = "rmse", # <- which metric to visualize
  5. select_best = TRUE # <- one point per workflow
  6. )

tidymodels-exercise-09 - 图7

  1. autoplot(grid_results, id = "Cubist", metric = "rmse")

tidymodels-exercise-09 - 图8

更加快捷的方式筛选多个模型

  1. library(finetune)
  2. race_ctrl <-
  3. control_race(
  4. save_pred = TRUE,
  5. parallel_over = "everything",
  6. save_workflow = TRUE
  7. )
  8. race_results <-
  9. all_workflows %>%
  10. workflow_map(
  11. "tune_race_anova", # 这个方法更快
  12. seed = 1503,
  13. resamples = concrete_folds,
  14. grid = 25,
  15. control = race_ctrl
  16. )

可视化结果:

  1. autoplot(
  2. race_results,
  3. rank_metric = "rmse",
  4. metric = "rmse",
  5. select_best = TRUE
  6. )

tidymodels-exercise-09 - 图9

比较一下两种方法，看差别大不大

  1. matched_results <-
  2. rank_results(race_results, select_best = TRUE) %>%
  3. select(wflow_id, .metric, race = mean, config_race = .config) %>%
  4. inner_join(
  5. rank_results(grid_results, select_best = TRUE) %>%
  6. select(wflow_id, .metric, complete = mean,
  7. config_complete = .config, model),
  8. by = c("wflow_id", ".metric"),
  9. ) %>%
  10. filter(.metric == "rmse")
  11. matched_results %>%
  12. ggplot(aes(x = complete, y = race)) +
  13. geom_abline(lty = 3) +
  14. geom_point(aes(col = model)) +
  15. coord_obs_pred() +
  16. labs(x = "Complete Grid RMSE", y = "Racing RMSE")

tidymodels-exercise-09 - 图10

选择最后的模型

  1. best_results <-
  2. race_results %>%
  3. extract_workflow_set_result("boosting") %>%
  4. select_best(metric = "rmse")
  5. best_results

tidymodels-exercise-09 - 图11

  1. boosting_test_results <-
  2. race_results %>%
  3. extract_workflow("boosting") %>%
  4. finalize_workflow(best_results) %>%
  5. last_fit(split = concrete_split)
  1. collect_metrics(boosting_test_results)

tidymodels-exercise-09 - 图12

  1. boosting_test_results %>%
  2. collect_predictions() %>%
  3. ggplot(aes(x = compressive_strength, y = .pred)) +
  4. geom_abline(col = "green", lty = 2) +
  5. geom_point(alpha = 0.5) +
  6. coord_obs_pred() +
  7. labs(x = "observed", y = "predicted")

tidymodels-exercise-09 - 图13