tidymodels - tidymodels-exercise-06 - 《R语言机器学习》

数据探索
建立模型
结果

数据探索

library(tidyverse)
## -- Attaching packages ----------------------------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
ikea <- read_csv("../datasets/tidytuesday/data/2020/2020-11-03/ikea.csv")
## 
## -- Column specification ----------------------------------------------------------------------------
## cols(
##   X1 = col_double(),
##   item_id = col_double(),
##   name = col_character(),
##   category = col_character(),
##   price = col_double(),
##   old_price = col_character(),
##   sellable_online = col_logical(),
##   link = col_character(),
##   other_colors = col_character(),
##   short_description = col_character(),
##   designer = col_character(),
##   depth = col_double(),
##   height = col_double(),
##   width = col_double()
## )
ikea
## # A tibble: 3,694 x 14
##       X1  item_id name    category price old_price sellable_online link         
##    <dbl>    <dbl> <chr>   <chr>    <dbl> <chr>     <lgl>           <chr>        
##  1     0 90420332 FREKVE~ Bar fur~   265 No old p~ TRUE            https://www.~
##  2     1   368814 NORDVI~ Bar fur~   995 No old p~ FALSE           https://www.~
##  3     2  9333523 NORDVI~ Bar fur~  2095 No old p~ FALSE           https://www.~
##  4     3 80155205 STIG    Bar fur~    69 No old p~ TRUE            https://www.~
##  5     4 30180504 NORBERG Bar fur~   225 No old p~ TRUE            https://www.~
##  6     5 10122647 INGOLF  Bar fur~   345 No old p~ TRUE            https://www.~
##  7     6 70404875 FRANKL~ Bar fur~   129 No old p~ TRUE            https://www.~
##  8     7 60155602 DALFRED Bar fur~   195 No old p~ TRUE            https://www.~
##  9     8 50406465 FRANKL~ Bar fur~   129 No old p~ TRUE            https://www.~
## 10     9 69304221 EKEDAL~ Bar fur~  2176 SR 2,375  TRUE            https://www.~
## # ... with 3,684 more rows, and 6 more variables: other_colors <chr>,
## #   short_description <chr>, designer <chr>, depth <dbl>, height <dbl>,
## #   width <dbl>

ikea %>%
  select(X1, price, depth:width) %>%
  pivot_longer(depth:width, names_to = "dim") %>%
  ggplot(aes(value, price, color = dim)) +
  geom_point(alpha = 0.4, show.legend = FALSE) +
  scale_y_log10() +
  facet_wrap(~dim, scales = "free_x") +
  labs(x = NULL)

tidymodels-exercise-06 - 图1

ikea_df <- ikea %>%
  select(price, name, category, depth, height, width) %>%
  mutate(price = log10(price)) %>%
  mutate_if(is.character, factor) 
ikea_df
## # A tibble: 3,694 x 6
##    price name                  category      depth height width
##    <dbl> <fct>                 <fct>         <dbl>  <dbl> <dbl>
##  1  2.42 FREKVENS              Bar furniture    NA     99    51
##  2  3.00 NORDVIKEN             Bar furniture    NA    105    80
##  3  3.32 NORDVIKEN / NORDVIKEN Bar furniture    NA     NA    NA
##  4  1.84 STIG                  Bar furniture    50    100    60
##  5  2.35 NORBERG               Bar furniture    60     43    74
##  6  2.54 INGOLF                Bar furniture    45     91    40
##  7  2.11 FRANKLIN              Bar furniture    44     95    50
##  8  2.29 DALFRED               Bar furniture    50     NA    50
##  9  2.11 FRANKLIN              Bar furniture    44     95    50
## 10  3.34 EKEDALEN / EKEDALEN   Bar furniture    NA     NA    NA
## # ... with 3,684 more rows

建立模型

library(tidymodels)
## -- Attaching packages ---------------------------------------------------------- tidymodels 0.1.3 --
## v broom        0.7.6      v rsample      0.0.9 
## v dials        0.0.9      v tune         0.1.5 
## v infer        0.5.4      v workflows    0.2.2 
## v modeldata    0.1.0      v workflowsets 0.0.2 
## v parsnip      0.1.5      v yardstick    0.0.8 
## v recipes      0.1.16
## -- Conflicts ------------------------------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## * Use tidymodels_prefer() to resolve common conflicts.
ikea_df <- ikea_df[1:100,]
set.seed(123)
ikea_split <- initial_split(ikea_df, strata = price)
ikea_train <- training(ikea_split)
ikea_test <- testing(ikea_split)
set.seed(234)
ikea_folds <- bootstraps(ikea_train, strata = price,times = 5)
ikea_folds
## # Bootstrap sampling using stratification 
## # A tibble: 5 x 2
##   splits          id        
##   <list>          <chr>     
## 1 <split [76/28]> Bootstrap1
## 2 <split [76/29]> Bootstrap2
## 3 <split [76/26]> Bootstrap3
## 4 <split [76/27]> Bootstrap4
## 5 <split [76/27]> Bootstrap5

library(usemodels)
use_ranger(price ~ ., data = ikea_train)
## ranger_recipe <- 
##   recipe(formula = price ~ ., data = ikea_train) 
## 
## ranger_spec <- 
##   rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
##   set_mode("regression") %>% 
##   set_engine("ranger") 
## 
## ranger_workflow <- 
##   workflow() %>% 
##   add_recipe(ranger_recipe) %>% 
##   add_model(ranger_spec) 
## 
## set.seed(31129)
## ranger_tune <-
##   tune_grid(ranger_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

library(textrecipes)
ranger_recipe <-
  recipe(formula = price ~ ., data = ikea_train) %>%
  step_other(name, category, threshold = 0.01) %>%
  #step_clean_levels(name, category) %>%
  step_impute_knn(depth, height, width)
ranger_spec <-
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
  set_mode("regression") %>%
  set_engine("ranger",importance = "permutation")
ranger_workflow <-
  workflow() %>%
  add_recipe(ranger_recipe) %>%
  add_model(ranger_spec)
set.seed(8577)
doParallel::registerDoParallel()
ranger_tune <- tune_grid(
  ranger_workflow,
  resamples = ikea_folds,
  grid = 5
  )
## i Creating pre-processing data to finalize unknown parameter: mtry

ranger_tune
## # Tuning results
## # Bootstrap sampling using stratification 
## # A tibble: 5 x 4
##   splits          id         .metrics              .notes              
##   <list>          <chr>      <list>                <list>              
## 1 <split [76/28]> Bootstrap1 <tibble[,6] [10 x 6]> <tibble[,1] [0 x 1]>
## 2 <split [76/29]> Bootstrap2 <tibble[,6] [10 x 6]> <tibble[,1] [0 x 1]>
## 3 <split [76/26]> Bootstrap3 <tibble[,6] [10 x 6]> <tibble[,1] [0 x 1]>
## 4 <split [76/27]> Bootstrap4 <tibble[,6] [10 x 6]> <tibble[,1] [0 x 1]>
## 5 <split [76/27]> Bootstrap5 <tibble[,6] [10 x 6]> <tibble[,1] [0 x 1]>

结果

show_best(ranger_tune, metric = "rmse")
## # A tibble: 5 x 8
##    mtry min_n .metric .estimator  mean     n std_err .config             
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1     5     9 rmse    standard   0.329     5  0.0123 Preprocessor1_Model1
## 2     4    31 rmse    standard   0.331     5  0.0112 Preprocessor1_Model4
## 3     3    24 rmse    standard   0.332     5  0.0109 Preprocessor1_Model5
## 4     2    37 rmse    standard   0.333     5  0.0127 Preprocessor1_Model2
## 5     3    13 rmse    standard   0.334     5  0.0114 Preprocessor1_Model3

show_best(ranger_tune, metric = "rsq")
## # A tibble: 5 x 8
##    mtry min_n .metric .estimator  mean     n std_err .config             
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1     5     9 rsq     standard   0.316     5  0.0421 Preprocessor1_Model1
## 2     3    13 rsq     standard   0.280     5  0.0452 Preprocessor1_Model3
## 3     4    31 rsq     standard   0.264     5  0.0473 Preprocessor1_Model4
## 4     3    24 rsq     standard   0.261     5  0.0464 Preprocessor1_Model5
## 5     2    37 rsq     standard   0.221     5  0.0503 Preprocessor1_Model2

autoplot(ranger_tune)

tidymodels-exercise-06 - 图2

把训练好的参数放进模型中

final_rf <- ranger_workflow %>%
  finalize_workflow(select_best(ranger_tune, metric = "rmse"))
final_rf
## == Workflow ========================================================================================
## Preprocessor: Recipe
## Model: rand_forest()
## 
## -- Preprocessor ------------------------------------------------------------------------------------
## 2 Recipe Steps
## 
## * step_other()
## * step_impute_knn()
## 
## -- Model -------------------------------------------------------------------------------------------
## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   mtry = 5
##   trees = 1000
##   min_n = 9
## 
## Engine-Specific Arguments:
##   importance = permutation
## 
## Computational engine: ranger

lase_fit使用训练好的参数重新作用于训练集！！并且！！同时获得测试集的效果

ikea_fit <- last_fit(final_rf, ikea_split)
ikea_fit # 这个`last_fit`使用测试集得出来的
## # Resampling results
## # Manual resampling 
## # A tibble: 1 x 6
##   splits      id          .metrics       .notes       .predictions     .workflow
##   <list>      <chr>       <list>         <list>       <list>           <list>   
## 1 <split [76~ train/test~ <tibble[,4] [~ <tibble[,1]~ <tibble[,4] [24~ <workflo~

# 这2个都是基于测试集得出的结果
collect_metrics(ikea_fit)
## # A tibble: 2 x 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.310 Preprocessor1_Model1
## 2 rsq     standard       0.294 Preprocessor1_Model1
collect_predictions(ikea_fit)
## # A tibble: 24 x 5
##    id               .pred  .row price .config             
##    <chr>            <dbl> <int> <dbl> <chr>               
##  1 train/test split  2.38     5  2.35 Preprocessor1_Model1
##  2 train/test split  2.27    15  2.54 Preprocessor1_Model1
##  3 train/test split  2.95    21  3.00 Preprocessor1_Model1
##  4 train/test split  2.54    23  2.77 Preprocessor1_Model1
##  5 train/test split  2.27    25  2.54 Preprocessor1_Model1
##  6 train/test split  2.27    26  2.54 Preprocessor1_Model1
##  7 train/test split  3.16    31  2.77 Preprocessor1_Model1
##  8 train/test split  2.61    32  2.64 Preprocessor1_Model1
##  9 train/test split  3.25    40  3.34 Preprocessor1_Model1
## 10 train/test split  2.44    41  2.87 Preprocessor1_Model1
## # ... with 14 more rows

collect_predictions(ikea_fit) %>%
  ggplot(aes(price, .pred)) +
  geom_abline(lty = 1, color = "red",lwd=2) +
  geom_point(alpha = 0.5, color = "midnightblue") +
  coord_fixed()

tidymodels-exercise-06 - 图3

现在可以使用训练好的ikea_fit预测其他的数据了

predict(ikea_fit$.workflow[[1]], ikea_test[10, ])
## # A tibble: 1 x 1
##   .pred
##   <dbl>
## 1  2.44

使用vip包看看重要性，需要importance这个参数

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
imp_spec <- ranger_spec %>%
  finalize_model(select_best(ranger_tune)) %>%
  set_engine("ranger", importance = "permutation")
workflow() %>%
  add_recipe(ranger_recipe) %>%
  add_model(imp_spec) %>%
  fit(ikea_train) %>%
  pull_workflow_fit() %>%
  vip(aesthetics = list(alpha = 0.8, fill = "midnightblue"))

tidymodels-exercise-06 - 图4

finalize_model，这里有一个问题，因为原数据是有缺失值的，如果不使用workflow，缺失值就不会被填补，后面fit和last_fit有可能会出错，基于树的模型是不能处理有缺失值的数据的

final_range已经使用了训练好的参数，但是还没有fit过，可以使用一下2种方法fit

现在可以预测新数据了