数据探索
library(tidyverse)## -- Attaching packages ----------------------------------------------------------- tidyverse 1.3.1 --## v ggplot2 3.3.3 v purrr 0.3.4## v tibble 3.1.1 v dplyr 1.0.6## v tidyr 1.1.3 v stringr 1.4.0## v readr 1.4.0 v forcats 0.5.1## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --## x dplyr::filter() masks stats::filter()## x dplyr::lag() masks stats::lag()turbines <- read_csv("../datasets/tidytuesday/data/2020/2020-10-27/wind-turbine.csv")## ## -- Column specification ----------------------------------------------------------------------------## cols(## objectid = col_double(),## province_territory = col_character(),## project_name = col_character(),## total_project_capacity_mw = col_double(),## turbine_identifier = col_character(),## turbine_number_in_project = col_character(),## turbine_rated_capacity_k_w = col_double(),## rotor_diameter_m = col_double(),## hub_height_m = col_double(),## manufacturer = col_character(),## model = col_character(),## commissioning_date = col_character(),## latitude = col_double(),## longitude = col_double(),## notes = col_character()## )turbines## # A tibble: 6,698 x 15## objectid province_territo~ project_name total_project_ca~ turbine_identif~## <dbl> <chr> <chr> <dbl> <chr> ## 1 1 Alberta Optimist Wind ~ 0.9 OWE1 ## 2 2 Alberta Castle River W~ 44 CRW1 ## 3 3 Alberta Waterton Wind ~ 3.78 WWT1 ## 4 4 Alberta Waterton Wind ~ 3.78 WWT2 ## 5 5 Alberta Waterton Wind ~ 3.78 WWT3 ## 6 6 Alberta Waterton Wind ~ 3.78 WWT4 ## 7 7 Alberta Cowley North 19.5 CON1 ## 8 8 Alberta Cowley North 19.5 CON2 ## 9 9 Alberta Cowley North 19.5 CON3 ## 10 10 Alberta Cowley North 19.5 CON4 ## # ... with 6,688 more rows, and 10 more variables:## # turbine_number_in_project <chr>, turbine_rated_capacity_k_w <dbl>,## # rotor_diameter_m <dbl>, hub_height_m <dbl>, manufacturer <chr>,## # model <chr>, commissioning_date <chr>, latitude <dbl>, longitude <dbl>,## # notes <chr>
turbines_df <- turbines %>% transmute( turbine_capacity = turbine_rated_capacity_k_w, rotor_diameter_m, hub_height_m, commissioning_date = parse_number(commissioning_date), province_territory = fct_lump_n(province_territory, 10), model = fct_lump_n(model, 10) ) %>% filter(!is.na(turbine_capacity)) %>% mutate_if(is.character, factor)
turbines_df %>% select(turbine_capacity:commissioning_date) %>% pivot_longer(rotor_diameter_m:commissioning_date) %>% ggplot(aes(turbine_capacity, value)) + geom_hex(bins = 15, alpha = 0.8) + geom_smooth(method = "lm") + facet_wrap(~name, scales = "free_y") + labs(y = NULL) + scale_fill_gradient(high = "cyan3")## `geom_smooth()` using formula 'y ~ x'

建立模型
library(tidymodels)## -- Attaching packages ---------------------------------------------------------- tidymodels 0.1.3 --## v broom 0.7.6 v rsample 0.0.9 ## v dials 0.0.9 v tune 0.1.5 ## v infer 0.5.4 v workflows 0.2.2 ## v modeldata 0.1.0 v workflowsets 0.0.2 ## v parsnip 0.1.5 v yardstick 0.0.8 ## v recipes 0.1.16## -- Conflicts ------------------------------------------------------------- tidymodels_conflicts() --## x scales::discard() masks purrr::discard()## x dplyr::filter() masks stats::filter()## x recipes::fixed() masks stringr::fixed()## x dplyr::lag() masks stats::lag()## x yardstick::spec() masks readr::spec()## x recipes::step() masks stats::step()## * Use tidymodels_prefer() to resolve common conflicts.turbines_df <- turbines_df[1:300,]set.seed(123)wind_split <- initial_split(turbines_df, strata = turbine_capacity)wind_train <- training(wind_split)wind_test <- testing(wind_split)set.seed(234)wind_folds <- vfold_cv(wind_train, strata = turbine_capacity)wind_folds## # 10-fold cross-validation using stratification ## # A tibble: 10 x 2## splits id ## <list> <chr> ## 1 <split [200/25]> Fold01## 2 <split [202/23]> Fold02## 3 <split [202/23]> Fold03## 4 <split [203/22]> Fold04## 5 <split [203/22]> Fold05## 6 <split [203/22]> Fold06## 7 <split [203/22]> Fold07## 8 <split [203/22]> Fold08## 9 <split [203/22]> Fold09## 10 <split [203/22]> Fold10
tree_spec <- decision_tree( cost_complexity = tune(), tree_depth = tune(), min_n = tune()) %>% set_engine("rpart") %>% set_mode("regression")tree_spec## Decision Tree Model Specification (regression)## ## Main Arguments:## cost_complexity = tune()## tree_depth = tune()## min_n = tune()## ## Computational engine: rpart
tree_grid <- grid_regular(cost_complexity(), tree_depth(), min_n(), levels = 4)tree_grid## # A tibble: 64 x 3## cost_complexity tree_depth min_n## <dbl> <int> <int>## 1 0.0000000001 1 2## 2 0.0000001 1 2## 3 0.0001 1 2## 4 0.1 1 2## 5 0.0000000001 5 2## 6 0.0000001 5 2## 7 0.0001 5 2## 8 0.1 5 2## 9 0.0000000001 10 2## 10 0.0000001 10 2## # ... with 54 more rows
doParallel::registerDoParallel()set.seed(345)tree_rs <- tune_grid( tree_spec, turbine_capacity ~ ., resamples = wind_folds, grid = tree_grid, metrics = metric_set(rmse, rsq, mae, mape))tree_rs## # Tuning results## # 10-fold cross-validation using stratification ## # A tibble: 10 x 4## splits id .metrics .notes ## <list> <chr> <list> <list> ## 1 <split [200/25]> Fold01 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>## 2 <split [202/23]> Fold02 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>## 3 <split [202/23]> Fold03 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>## 4 <split [203/22]> Fold04 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>## 5 <split [203/22]> Fold05 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>## 6 <split [203/22]> Fold06 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>## 7 <split [203/22]> Fold07 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>## 8 <split [203/22]> Fold08 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>## 9 <split [203/22]> Fold09 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>## 10 <split [203/22]> Fold10 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>
评价模型
collect_metrics(tree_rs)## # A tibble: 256 x 9## cost_complexity tree_depth min_n .metric .estimator mean n std_err## <dbl> <int> <int> <chr> <chr> <dbl> <int> <dbl>## 1 0.0000000001 1 2 mae standard 224. 10 19.4 ## 2 0.0000000001 1 2 mape standard 13.3 10 1.56 ## 3 0.0000000001 1 2 rmse standard 395. 10 33.7 ## 4 0.0000000001 1 2 rsq standard 0.683 10 0.0308## 5 0.0000001 1 2 mae standard 224. 10 19.4 ## 6 0.0000001 1 2 mape standard 13.3 10 1.56 ## 7 0.0000001 1 2 rmse standard 395. 10 33.7 ## 8 0.0000001 1 2 rsq standard 0.683 10 0.0308## 9 0.0001 1 2 mae standard 224. 10 19.4 ## 10 0.0001 1 2 mape standard 13.3 10 1.56 ## # ... with 246 more rows, and 1 more variable: .config <chr>
autoplot(tree_rs) + theme_light(base_family = "IBMPlexSans")

show_best(tree_rs, "mape")## # A tibble: 5 x 9## cost_complexity tree_depth min_n .metric .estimator mean n std_err## <dbl> <int> <int> <chr> <chr> <dbl> <int> <dbl>## 1 0.0000000001 5 2 mape standard 1.37 10 1.19## 2 0.0000001 5 2 mape standard 1.37 10 1.19## 3 0.0001 5 2 mape standard 1.37 10 1.19## 4 0.0000000001 10 2 mape standard 1.37 10 1.19## 5 0.0000001 10 2 mape standard 1.37 10 1.19## # ... with 1 more variable: .config <chr>
select_best(tree_rs, "rmse")## # A tibble: 1 x 4## cost_complexity tree_depth min_n .config ## <dbl> <int> <int> <chr> ## 1 0.0000000001 5 2 Preprocessor1_Model05
final_tree <- finalize_model(tree_spec, select_best(tree_rs, "rmse"))final_tree## Decision Tree Model Specification (regression)## ## Main Arguments:## cost_complexity = 1e-10## tree_depth = 5## min_n = 2## ## Computational engine: rpart
final_fit <- fit(final_tree, turbine_capacity ~ ., wind_train)final_rs <- last_fit(final_tree, turbine_capacity ~ ., wind_split)
predict(final_fit, wind_train[100, ])## # A tibble: 1 x 1## .pred## <dbl>## 1 660
predict(final_rs$.workflow[[1]], wind_train[100, ])## # A tibble: 1 x 1## .pred## <dbl>## 1 660
library(vip)## ## Attaching package: 'vip'## The following object is masked from 'package:utils':## ## vifinal_fit %>% vip(geom = "col", aesthetics = list(fill = "midnightblue", alpha = 0.8)) + scale_y_continuous(expand = c(0, 0))

library(parttree)ex_fit <- fit( final_tree, turbine_capacity ~ rotor_diameter_m + commissioning_date, wind_train)wind_train %>% ggplot(aes(rotor_diameter_m, commissioning_date)) + geom_parttree(data = ex_fit, aes(fill = turbine_capacity), alpha = 0.3) + geom_jitter(alpha = 0.7, width = 1, height = 0.5, aes(color = turbine_capacity)) + scale_colour_viridis_c(aesthetics = c("color", "fill"))

collect_metrics(final_rs)## # A tibble: 2 x 4## .metric .estimator .estimate .config ## <chr> <chr> <dbl> <chr> ## 1 rmse standard 10.4 Preprocessor1_Model1## 2 rsq standard 1.00 Preprocessor1_Model1
final_rs %>% collect_predictions() %>% ggplot(aes(turbine_capacity, .pred)) + geom_abline(slope = 1, lty = 2, color = "gray50", alpha = 0.5) + geom_point(alpha = 0.6, color = "midnightblue") + coord_fixed()
