数据探索
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
turbines <- read_csv("../datasets/tidytuesday/data/2020/2020-10-27/wind-turbine.csv")
##
## -- Column specification ----------------------------------------------------------------------------
## cols(
## objectid = col_double(),
## province_territory = col_character(),
## project_name = col_character(),
## total_project_capacity_mw = col_double(),
## turbine_identifier = col_character(),
## turbine_number_in_project = col_character(),
## turbine_rated_capacity_k_w = col_double(),
## rotor_diameter_m = col_double(),
## hub_height_m = col_double(),
## manufacturer = col_character(),
## model = col_character(),
## commissioning_date = col_character(),
## latitude = col_double(),
## longitude = col_double(),
## notes = col_character()
## )
turbines
## # A tibble: 6,698 x 15
## objectid province_territo~ project_name total_project_ca~ turbine_identif~
## <dbl> <chr> <chr> <dbl> <chr>
## 1 1 Alberta Optimist Wind ~ 0.9 OWE1
## 2 2 Alberta Castle River W~ 44 CRW1
## 3 3 Alberta Waterton Wind ~ 3.78 WWT1
## 4 4 Alberta Waterton Wind ~ 3.78 WWT2
## 5 5 Alberta Waterton Wind ~ 3.78 WWT3
## 6 6 Alberta Waterton Wind ~ 3.78 WWT4
## 7 7 Alberta Cowley North 19.5 CON1
## 8 8 Alberta Cowley North 19.5 CON2
## 9 9 Alberta Cowley North 19.5 CON3
## 10 10 Alberta Cowley North 19.5 CON4
## # ... with 6,688 more rows, and 10 more variables:
## # turbine_number_in_project <chr>, turbine_rated_capacity_k_w <dbl>,
## # rotor_diameter_m <dbl>, hub_height_m <dbl>, manufacturer <chr>,
## # model <chr>, commissioning_date <chr>, latitude <dbl>, longitude <dbl>,
## # notes <chr>
turbines_df <- turbines %>%
transmute(
turbine_capacity = turbine_rated_capacity_k_w,
rotor_diameter_m,
hub_height_m,
commissioning_date = parse_number(commissioning_date),
province_territory = fct_lump_n(province_territory, 10),
model = fct_lump_n(model, 10)
) %>%
filter(!is.na(turbine_capacity)) %>%
mutate_if(is.character, factor)
turbines_df %>%
select(turbine_capacity:commissioning_date) %>%
pivot_longer(rotor_diameter_m:commissioning_date) %>%
ggplot(aes(turbine_capacity, value)) +
geom_hex(bins = 15, alpha = 0.8) +
geom_smooth(method = "lm") +
facet_wrap(~name, scales = "free_y") +
labs(y = NULL) +
scale_fill_gradient(high = "cyan3")
## `geom_smooth()` using formula 'y ~ x'

建立模型
library(tidymodels)
## -- Attaching packages ---------------------------------------------------------- tidymodels 0.1.3 --
## v broom 0.7.6 v rsample 0.0.9
## v dials 0.0.9 v tune 0.1.5
## v infer 0.5.4 v workflows 0.2.2
## v modeldata 0.1.0 v workflowsets 0.0.2
## v parsnip 0.1.5 v yardstick 0.0.8
## v recipes 0.1.16
## -- Conflicts ------------------------------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## * Use tidymodels_prefer() to resolve common conflicts.
turbines_df <- turbines_df[1:300,]
set.seed(123)
wind_split <- initial_split(turbines_df, strata = turbine_capacity)
wind_train <- training(wind_split)
wind_test <- testing(wind_split)
set.seed(234)
wind_folds <- vfold_cv(wind_train, strata = turbine_capacity)
wind_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 x 2
## splits id
## <list> <chr>
## 1 <split [200/25]> Fold01
## 2 <split [202/23]> Fold02
## 3 <split [202/23]> Fold03
## 4 <split [203/22]> Fold04
## 5 <split [203/22]> Fold05
## 6 <split [203/22]> Fold06
## 7 <split [203/22]> Fold07
## 8 <split [203/22]> Fold08
## 9 <split [203/22]> Fold09
## 10 <split [203/22]> Fold10
tree_spec <- decision_tree(
cost_complexity = tune(),
tree_depth = tune(),
min_n = tune()
) %>%
set_engine("rpart") %>%
set_mode("regression")
tree_spec
## Decision Tree Model Specification (regression)
##
## Main Arguments:
## cost_complexity = tune()
## tree_depth = tune()
## min_n = tune()
##
## Computational engine: rpart
tree_grid <- grid_regular(cost_complexity(), tree_depth(), min_n(), levels = 4)
tree_grid
## # A tibble: 64 x 3
## cost_complexity tree_depth min_n
## <dbl> <int> <int>
## 1 0.0000000001 1 2
## 2 0.0000001 1 2
## 3 0.0001 1 2
## 4 0.1 1 2
## 5 0.0000000001 5 2
## 6 0.0000001 5 2
## 7 0.0001 5 2
## 8 0.1 5 2
## 9 0.0000000001 10 2
## 10 0.0000001 10 2
## # ... with 54 more rows
doParallel::registerDoParallel()
set.seed(345)
tree_rs <- tune_grid(
tree_spec,
turbine_capacity ~ .,
resamples = wind_folds,
grid = tree_grid,
metrics = metric_set(rmse, rsq, mae, mape)
)
tree_rs
## # Tuning results
## # 10-fold cross-validation using stratification
## # A tibble: 10 x 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [200/25]> Fold01 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>
## 2 <split [202/23]> Fold02 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>
## 3 <split [202/23]> Fold03 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>
## 4 <split [203/22]> Fold04 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>
## 5 <split [203/22]> Fold05 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>
## 6 <split [203/22]> Fold06 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>
## 7 <split [203/22]> Fold07 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>
## 8 <split [203/22]> Fold08 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>
## 9 <split [203/22]> Fold09 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>
## 10 <split [203/22]> Fold10 <tibble[,7] [256 x 7]> <tibble[,1] [0 x 1]>
评价模型
collect_metrics(tree_rs)
## # A tibble: 256 x 9
## cost_complexity tree_depth min_n .metric .estimator mean n std_err
## <dbl> <int> <int> <chr> <chr> <dbl> <int> <dbl>
## 1 0.0000000001 1 2 mae standard 224. 10 19.4
## 2 0.0000000001 1 2 mape standard 13.3 10 1.56
## 3 0.0000000001 1 2 rmse standard 395. 10 33.7
## 4 0.0000000001 1 2 rsq standard 0.683 10 0.0308
## 5 0.0000001 1 2 mae standard 224. 10 19.4
## 6 0.0000001 1 2 mape standard 13.3 10 1.56
## 7 0.0000001 1 2 rmse standard 395. 10 33.7
## 8 0.0000001 1 2 rsq standard 0.683 10 0.0308
## 9 0.0001 1 2 mae standard 224. 10 19.4
## 10 0.0001 1 2 mape standard 13.3 10 1.56
## # ... with 246 more rows, and 1 more variable: .config <chr>
autoplot(tree_rs) + theme_light(base_family = "IBMPlexSans")

show_best(tree_rs, "mape")
## # A tibble: 5 x 9
## cost_complexity tree_depth min_n .metric .estimator mean n std_err
## <dbl> <int> <int> <chr> <chr> <dbl> <int> <dbl>
## 1 0.0000000001 5 2 mape standard 1.37 10 1.19
## 2 0.0000001 5 2 mape standard 1.37 10 1.19
## 3 0.0001 5 2 mape standard 1.37 10 1.19
## 4 0.0000000001 10 2 mape standard 1.37 10 1.19
## 5 0.0000001 10 2 mape standard 1.37 10 1.19
## # ... with 1 more variable: .config <chr>
select_best(tree_rs, "rmse")
## # A tibble: 1 x 4
## cost_complexity tree_depth min_n .config
## <dbl> <int> <int> <chr>
## 1 0.0000000001 5 2 Preprocessor1_Model05
final_tree <- finalize_model(tree_spec, select_best(tree_rs, "rmse"))
final_tree
## Decision Tree Model Specification (regression)
##
## Main Arguments:
## cost_complexity = 1e-10
## tree_depth = 5
## min_n = 2
##
## Computational engine: rpart
final_fit <- fit(final_tree, turbine_capacity ~ ., wind_train)
final_rs <- last_fit(final_tree, turbine_capacity ~ ., wind_split)
predict(final_fit, wind_train[100, ])
## # A tibble: 1 x 1
## .pred
## <dbl>
## 1 660
predict(final_rs$.workflow[[1]], wind_train[100, ])
## # A tibble: 1 x 1
## .pred
## <dbl>
## 1 660
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
final_fit %>%
vip(geom = "col", aesthetics = list(fill = "midnightblue", alpha = 0.8)) +
scale_y_continuous(expand = c(0, 0))

library(parttree)
ex_fit <- fit(
final_tree,
turbine_capacity ~ rotor_diameter_m + commissioning_date,
wind_train
)
wind_train %>%
ggplot(aes(rotor_diameter_m, commissioning_date)) +
geom_parttree(data = ex_fit, aes(fill = turbine_capacity), alpha = 0.3) +
geom_jitter(alpha = 0.7, width = 1, height = 0.5, aes(color = turbine_capacity)) +
scale_colour_viridis_c(aesthetics = c("color", "fill"))

collect_metrics(final_rs)
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 10.4 Preprocessor1_Model1
## 2 rsq standard 1.00 Preprocessor1_Model1
final_rs %>%
collect_predictions() %>%
ggplot(aes(turbine_capacity, .pred)) +
geom_abline(slope = 1, lty = 2, color = "gray50", alpha = 0.5) +
geom_point(alpha = 0.6, color = "midnightblue") +
coord_fixed()
