数据探索

  1. library(tidyverse)
  2. ## -- Attaching packages ----------------------------------------------------------- tidyverse 1.3.1 --
  3. ## v ggplot2 3.3.3 v purrr 0.3.4
  4. ## v tibble 3.1.1 v dplyr 1.0.6
  5. ## v tidyr 1.1.3 v stringr 1.4.0
  6. ## v readr 1.4.0 v forcats 0.5.1
  7. ## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
  8. ## x dplyr::filter() masks stats::filter()
  9. ## x dplyr::lag() masks stats::lag()
  10. turbines <- read_csv("../datasets/tidytuesday/data/2020/2020-10-27/wind-turbine.csv")
  11. ##
  12. ## -- Column specification ----------------------------------------------------------------------------
  13. ## cols(
  14. ## objectid = col_double(),
  15. ## province_territory = col_character(),
  16. ## project_name = col_character(),
  17. ## total_project_capacity_mw = col_double(),
  18. ## turbine_identifier = col_character(),
  19. ## turbine_number_in_project = col_character(),
  20. ## turbine_rated_capacity_k_w = col_double(),
  21. ## rotor_diameter_m = col_double(),
  22. ## hub_height_m = col_double(),
  23. ## manufacturer = col_character(),
  24. ## model = col_character(),
  25. ## commissioning_date = col_character(),
  26. ## latitude = col_double(),
  27. ## longitude = col_double(),
  28. ## notes = col_character()
  29. ## )
  30. turbines
  31. ## # A tibble: 6,698 x 15
  32. ## objectid province_territo~ project_name total_project_ca~ turbine_identif~
  33. ## <dbl> <chr> <chr> <dbl> <chr>
  34. ## 1 1 Alberta Optimist Wind ~ 0.9 OWE1
  35. ## 2 2 Alberta Castle River W~ 44 CRW1
  36. ## 3 3 Alberta Waterton Wind ~ 3.78 WWT1
  37. ## 4 4 Alberta Waterton Wind ~ 3.78 WWT2
  38. ## 5 5 Alberta Waterton Wind ~ 3.78 WWT3
  39. ## 6 6 Alberta Waterton Wind ~ 3.78 WWT4
  40. ## 7 7 Alberta Cowley North 19.5 CON1
  41. ## 8 8 Alberta Cowley North 19.5 CON2
  42. ## 9 9 Alberta Cowley North 19.5 CON3
  43. ## 10 10 Alberta Cowley North 19.5 CON4
  44. ## # ... with 6,688 more rows, and 10 more variables:
  45. ## # turbine_number_in_project <chr>, turbine_rated_capacity_k_w <dbl>,
  46. ## # rotor_diameter_m <dbl>, hub_height_m <dbl>, manufacturer <chr>,
  47. ## # model <chr>, commissioning_date <chr>, latitude <dbl>, longitude <dbl>,
  48. ## # notes <chr>
  1. turbines_df <- turbines %>%
  2. transmute(
  3. turbine_capacity = turbine_rated_capacity_k_w,
  4. rotor_diameter_m,
  5. hub_height_m,
  6. commissioning_date = parse_number(commissioning_date),
  7. province_territory = fct_lump_n(province_territory, 10),
  8. model = fct_lump_n(model, 10)
  9. ) %>%
  10. filter(!is.na(turbine_capacity)) %>%
  11. mutate_if(is.character, factor)
  1. turbines_df %>%
  2. select(turbine_capacity:commissioning_date) %>%
  3. pivot_longer(rotor_diameter_m:commissioning_date) %>%
  4. ggplot(aes(turbine_capacity, value)) +
  5. geom_hex(bins = 15, alpha = 0.8) +
  6. geom_smooth(method = "lm") +
  7. facet_wrap(~name, scales = "free_y") +
  8. labs(y = NULL) +
  9. scale_fill_gradient(high = "cyan3")
  10. ## `geom_smooth()` using formula 'y ~ x'

tidymodels-exercise-07 - 图1

建立模型

  1. library(tidymodels)
  2. ## -- Attaching packages ---------------------------------------------------------- tidymodels 0.1.3 --
  3. ## v broom 0.7.6 v rsample 0.0.9
  4. ## v dials 0.0.9 v tune 0.1.5
  5. ## v infer 0.5.4 v workflows 0.2.2
  6. ## v modeldata 0.1.0 v workflowsets 0.0.2
  7. ## v parsnip 0.1.5 v yardstick 0.0.8
  8. ## v recipes 0.1.16
  9. ## -- Conflicts ------------------------------------------------------------- tidymodels_conflicts() --
  10. ## x scales::discard() masks purrr::discard()
  11. ## x dplyr::filter() masks stats::filter()
  12. ## x recipes::fixed() masks stringr::fixed()
  13. ## x dplyr::lag() masks stats::lag()
  14. ## x yardstick::spec() masks readr::spec()
  15. ## x recipes::step() masks stats::step()
  16. ## * Use tidymodels_prefer() to resolve common conflicts.
  17. turbines_df <- turbines_df[1:300,]
  18. set.seed(123)
  19. wind_split <- initial_split(turbines_df, strata = turbine_capacity)
  20. wind_train <- training(wind_split)
  21. wind_test <- testing(wind_split)
  22. set.seed(234)
  23. wind_folds <- vfold_cv(wind_train, strata = turbine_capacity)
  24. wind_folds
  25. ## # 10-fold cross-validation using stratification
  26. ## # A tibble: 10 x 2
  27. ## splits id
  28. ## <list> <chr>
  29. ## 1 <split [200/25]> Fold01
  30. ## 2 <split [202/23]> Fold02
  31. ## 3 <split [202/23]> Fold03
  32. ## 4 <split [203/22]> Fold04
  33. ## 5 <split [203/22]> Fold05
  34. ## 6 <split [203/22]> Fold06
  35. ## 7 <split [203/22]> Fold07
  36. ## 8 <split [203/22]> Fold08
  37. ## 9 <split [203/22]> Fold09
  38. ## 10 <split [203/22]> Fold10
  1. tree_spec <- decision_tree(
  2. cost_complexity = tune(),
  3. tree_depth = tune(),
  4. min_n = tune()
  5. ) %>%
  6. set_engine("rpart") %>%
  7. set_mode("regression")
  8. tree_spec
  9. ## Decision Tree Model Specification (regression)
  10. ##
  11. ## Main Arguments:
  12. ## cost_complexity = tune()
  13. ## tree_depth = tune()
  14. ## min_n = tune()
  15. ##
  16. ## Computational engine: rpart
  1. tree_grid <- grid_regular(cost_complexity(), tree_depth(), min_n(), levels = 4)
  2. tree_grid
  3. ## # A tibble: 64 x 3
  4. ## cost_complexity tree_depth min_n
  5. ## <dbl> <int> <int>
  6. ## 1 0.0000000001 1 2
  7. ## 2 0.0000001 1 2
  8. ## 3 0.0001 1 2
  9. ## 4 0.1 1 2
  10. ## 5 0.0000000001 5 2
  11. ## 6 0.0000001 5 2
  12. ## 7 0.0001 5 2
  13. ## 8 0.1 5 2
  14. ## 9 0.0000000001 10 2
  15. ## 10 0.0000001 10 2
  16. ## # ... with 54 more rows
  1. doParallel::registerDoParallel()
  2. set.seed(345)
  3. tree_rs <- tune_grid(
  4. tree_spec,
  5. turbine_capacity ~ .,
  6. resamples = wind_folds,
  7. grid = tree_grid,
  8. metrics = metric_set(rmse, rsq, mae, mape)
  9. )
  10. tree_rs
  11. ## # Tuning results
  12. ## # 10-fold cross-validation using stratification
  13. ## # A tibble: 10 x 4
  14. ## splits id .metrics .notes
  15. ## <list> <chr> <list> <list>
  16. ## 1 <split [200/25]> Fold01 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>
  17. ## 2 <split [202/23]> Fold02 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>
  18. ## 3 <split [202/23]> Fold03 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>
  19. ## 4 <split [203/22]> Fold04 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>
  20. ## 5 <split [203/22]> Fold05 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>
  21. ## 6 <split [203/22]> Fold06 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>
  22. ## 7 <split [203/22]> Fold07 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>
  23. ## 8 <split [203/22]> Fold08 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>
  24. ## 9 <split [203/22]> Fold09 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>
  25. ## 10 <split [203/22]> Fold10 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>

评价模型

  1. collect_metrics(tree_rs)
  2. ## # A tibble: 256 x 9
  3. ## cost_complexity tree_depth min_n .metric .estimator mean n std_err
  4. ## <dbl> <int> <int> <chr> <chr> <dbl> <int> <dbl>
  5. ## 1 0.0000000001 1 2 mae standard 224. 10 19.4
  6. ## 2 0.0000000001 1 2 mape standard 13.3 10 1.56
  7. ## 3 0.0000000001 1 2 rmse standard 395. 10 33.7
  8. ## 4 0.0000000001 1 2 rsq standard 0.683 10 0.0308
  9. ## 5 0.0000001 1 2 mae standard 224. 10 19.4
  10. ## 6 0.0000001 1 2 mape standard 13.3 10 1.56
  11. ## 7 0.0000001 1 2 rmse standard 395. 10 33.7
  12. ## 8 0.0000001 1 2 rsq standard 0.683 10 0.0308
  13. ## 9 0.0001 1 2 mae standard 224. 10 19.4
  14. ## 10 0.0001 1 2 mape standard 13.3 10 1.56
  15. ## # ... with 246 more rows, and 1 more variable: .config <chr>
  1. autoplot(tree_rs) + theme_light(base_family = "IBMPlexSans")

tidymodels-exercise-07 - 图2

  1. show_best(tree_rs, "mape")
  2. ## # A tibble: 5 x 9
  3. ## cost_complexity tree_depth min_n .metric .estimator mean n std_err
  4. ## <dbl> <int> <int> <chr> <chr> <dbl> <int> <dbl>
  5. ## 1 0.0000000001 5 2 mape standard 1.37 10 1.19
  6. ## 2 0.0000001 5 2 mape standard 1.37 10 1.19
  7. ## 3 0.0001 5 2 mape standard 1.37 10 1.19
  8. ## 4 0.0000000001 10 2 mape standard 1.37 10 1.19
  9. ## 5 0.0000001 10 2 mape standard 1.37 10 1.19
  10. ## # ... with 1 more variable: .config <chr>
  1. select_best(tree_rs, "rmse")
  2. ## # A tibble: 1 x 4
  3. ## cost_complexity tree_depth min_n .config
  4. ## <dbl> <int> <int> <chr>
  5. ## 1 0.0000000001 5 2 Preprocessor1_Model05
  1. final_tree <- finalize_model(tree_spec, select_best(tree_rs, "rmse"))
  2. final_tree
  3. ## Decision Tree Model Specification (regression)
  4. ##
  5. ## Main Arguments:
  6. ## cost_complexity = 1e-10
  7. ## tree_depth = 5
  8. ## min_n = 2
  9. ##
  10. ## Computational engine: rpart
  1. final_fit <- fit(final_tree, turbine_capacity ~ ., wind_train)
  2. final_rs <- last_fit(final_tree, turbine_capacity ~ ., wind_split)
  1. predict(final_fit, wind_train[100, ])
  2. ## # A tibble: 1 x 1
  3. ## .pred
  4. ## <dbl>
  5. ## 1 660
  1. predict(final_rs$.workflow[[1]], wind_train[100, ])
  2. ## # A tibble: 1 x 1
  3. ## .pred
  4. ## <dbl>
  5. ## 1 660
  1. library(vip)
  2. ##
  3. ## Attaching package: 'vip'
  4. ## The following object is masked from 'package:utils':
  5. ##
  6. ## vi
  7. final_fit %>%
  8. vip(geom = "col", aesthetics = list(fill = "midnightblue", alpha = 0.8)) +
  9. scale_y_continuous(expand = c(0, 0))

tidymodels-exercise-07 - 图3

  1. library(parttree)
  2. ex_fit <- fit(
  3. final_tree,
  4. turbine_capacity ~ rotor_diameter_m + commissioning_date,
  5. wind_train
  6. )
  7. wind_train %>%
  8. ggplot(aes(rotor_diameter_m, commissioning_date)) +
  9. geom_parttree(data = ex_fit, aes(fill = turbine_capacity), alpha = 0.3) +
  10. geom_jitter(alpha = 0.7, width = 1, height = 0.5, aes(color = turbine_capacity)) +
  11. scale_colour_viridis_c(aesthetics = c("color", "fill"))

tidymodels-exercise-07 - 图4

  1. collect_metrics(final_rs)
  2. ## # A tibble: 2 x 4
  3. ## .metric .estimator .estimate .config
  4. ## <chr> <chr> <dbl> <chr>
  5. ## 1 rmse standard 10.4 Preprocessor1_Model1
  6. ## 2 rsq standard 1.00 Preprocessor1_Model1
  1. final_rs %>%
  2. collect_predictions() %>%
  3. ggplot(aes(turbine_capacity, .pred)) +
  4. geom_abline(slope = 1, lty = 2, color = "gray50", alpha = 0.5) +
  5. geom_point(alpha = 0.6, color = "midnightblue") +
  6. coord_fixed()

tidymodels-exercise-07 - 图5