tidymodels - tidymodels-exercise-02 - 《R语言机器学习》

数据探索
建模
评价模型

数据探索

children是结果变量，是一个2分类变量。

library(tidyverse)
## -- Attaching packages ----------------------------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
hotels <- readr::read_csv("../datasets/tidytuesday/data/2020/2020-02-11/hotels.csv")
## 
## -- Column specification ----------------------------------------------------------------------------
## cols(
##   .default = col_double(),
##   hotel = col_character(),
##   arrival_date_month = col_character(),
##   meal = col_character(),
##   country = col_character(),
##   market_segment = col_character(),
##   distribution_channel = col_character(),
##   reserved_room_type = col_character(),
##   assigned_room_type = col_character(),
##   deposit_type = col_character(),
##   agent = col_character(),
##   company = col_character(),
##   customer_type = col_character(),
##   reservation_status = col_character(),
##   reservation_status_date = col_date(format = "")
## )
## i<U+00A0>Use `spec()` for the full column specifications.
hotel_stays <- hotels %>%
  filter(is_canceled == 0) %>%
  mutate(
    children = case_when( # 学习下case_when函数
      children + babies > 0 ~ "children",
      TRUE ~ "none"
    ),
    required_car_parking_spaces = case_when(
      required_car_parking_spaces > 0 ~ "parking",
      TRUE ~ "none"
    )
  ) %>%
  select(-is_canceled, -reservation_status, -babies)
hotel_stays
## # A tibble: 75,166 x 29
##    hotel      lead_time arrival_date_year arrival_date_mon~ arrival_date_week_n~
##    <chr>          <dbl>             <dbl> <chr>                            <dbl>
##  1 Resort Ho~       342              2015 July                                27
##  2 Resort Ho~       737              2015 July                                27
##  3 Resort Ho~         7              2015 July                                27
##  4 Resort Ho~        13              2015 July                                27
##  5 Resort Ho~        14              2015 July                                27
##  6 Resort Ho~        14              2015 July                                27
##  7 Resort Ho~         0              2015 July                                27
##  8 Resort Ho~         9              2015 July                                27
##  9 Resort Ho~        35              2015 July                                27
## 10 Resort Ho~        68              2015 July                                27
## # ... with 75,156 more rows, and 24 more variables:
## #   arrival_date_day_of_month <dbl>, stays_in_weekend_nights <dbl>,
## #   stays_in_week_nights <dbl>, adults <dbl>, children <chr>, meal <chr>,
## #   country <chr>, market_segment <chr>, distribution_channel <chr>,
## #   is_repeated_guest <dbl>, previous_cancellations <dbl>,
## #   previous_bookings_not_canceled <dbl>, reserved_room_type <chr>,
## #   assigned_room_type <chr>, booking_changes <dbl>, deposit_type <chr>,
## #   agent <chr>, company <chr>, days_in_waiting_list <dbl>,
## #   customer_type <chr>, adr <dbl>, required_car_parking_spaces <chr>,
## #   total_of_special_requests <dbl>, reservation_status_date <date>

hotel_stays %>% count(children) # children是结果变量
## # A tibble: 2 x 2
##   children     n
##   <chr>    <int>
## 1 children  6073
## 2 none     69093

skimr::skim(hotel_stays)

Table: Data summary


Name	hotel_stays
Number of rows	75166
Number of columns	29
_
Column type frequency:
character	14
Date	1
numeric	14
__
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
hotel	1	10	12	2
arrival_date_month	1	3	9	12
children	1	4	8	2
meal	1	2	9	5
country	1	2	4	166
market_segment	1	6	13	7
distribution_channel	1	3	9	5
reserved_room_type	1	1	1	9
assigned_room_type	1	1	1	10
deposit_type	1	10	10	3
agent	1	1	4	315
company	1	1	4	332
customer_type	1	5	15	4
required_car_parking_spaces	1	4	7	2

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
reservation_status_date	0	1	2015-07-01	2017-09-14	2016-09-01	805

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
lead_time	1	79.98	91.11	0.00	9.0	45.0	124	737	▇▂▁▁▁
arrival_date_year	1	2016.15	0.70	2015.00	2016.0	2016.0	2017	2017	▃▁▇▁▆
arrival_date_week_number	1	27.08	13.90	1.00	16.0	28.0	38	53	▆▇▇▇▆
arrival_date_day_of_month	1	15.84	8.78	1.00	8.0	16.0	23	31	▇▇▇▇▆
stays_in_weekend_nights	1	0.93	0.99	0.00	0.0	1.0	2	19	▇▁▁▁▁
stays_in_week_nights	1	2.46	1.92	0.00	1.0	2.0	3	50	▇▁▁▁▁
adults	1	1.83	0.51	0.00	2.0	2.0	2	4	▁▂▇▁▁
is_repeated_guest	1	0.04	0.20	0.00	0.0	0.0	0	1	▇▁▁▁▁
previous_cancellations	1	0.02	0.27	0.00	0.0	0.0	0	13	▇▁▁▁▁
previous_bookings_not_canceled	1	0.20	1.81	0.00	0.0	0.0	0	72	▇▁▁▁▁
booking_changes	1	0.29	0.74	0.00	0.0	0.0	0	21	▇▁▁▁▁
days_in_waiting_list	1	1.59	14.78	0.00	0.0	0.0	0	379	▇▁▁▁▁
adr	1	99.99	49.21	-6.38	67.5	92.5	125	510	▇▆▁▁▁
total_of_special_requests	1	0.71	0.83	0.00	0.0	1.0	1	5	▇▁▁▁▁

# 写出这么刘畅的代码取决于对数据的理解以及对tidyverse包的理解
hotel_stays %>%
  mutate(arrival_date_month = factor(arrival_date_month, levels = month.name)) %>%
  count(hotel, arrival_date_month, children) %>%
  group_by(hotel, children) %>%
  mutate(proportion = n / sum(n)) %>%
  ggplot(aes(arrival_date_month, proportion, fill = children)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent_format()) + # 学习下百分比的用法
  facet_wrap(~hotel, nrow = 2) +
  labs(x = NULL, y = "Proportion of hotel stays", fill = NULL)

tidymodels-exercise-02 - 图1

hotel_stays %>%
  count(hotel, required_car_parking_spaces, children) %>%
  group_by(hotel, children) %>%
  mutate(proportion = n / sum(n)) %>%
  ggplot(aes(required_car_parking_spaces, proportion, fill = children)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent_format()) +
  facet_wrap(~hotel, nrow = 2) +
  labs(x = NULL, y = "Proportion of hotel stays", fill = NULL)

tidymodels-exercise-02 - 图2

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
hotel_stays %>%
  select(
    children, adr,
    required_car_parking_spaces,
    total_of_special_requests
  ) %>%
  ggpairs(mapping = aes(color = children))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

tidymodels-exercise-02 - 图3

建模

数据准备

hotels_df <- hotel_stays %>%
  select(
    children, hotel, arrival_date_month, meal, adr, adults,
    required_car_parking_spaces, total_of_special_requests,
    stays_in_week_nights, stays_in_weekend_nights
  ) %>%
  mutate_if(is.character, factor)
hotels_df <- hotels_df[1:1000,]

library(tidymodels)
## -- Attaching packages ---------------------------------------------------------- tidymodels 0.1.3 --
## v broom        0.7.6      v rsample      0.0.9 
## v dials        0.0.9      v tune         0.1.5 
## v infer        0.5.4      v workflows    0.2.2 
## v modeldata    0.1.0      v workflowsets 0.0.2 
## v parsnip      0.1.5      v yardstick    0.0.8 
## v recipes      0.1.16
## -- Conflicts ------------------------------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## * Use tidymodels_prefer() to resolve common conflicts.
tidymodels_prefer()
set.seed(12)
hotel_split <- hotels_df %>% 
  initial_split()
hotel_train <- training(hotel_split)
hotel_test <- testing(hotel_split)
hotel_rec <- recipe(children ~ ., data = hotel_train) %>%
  themis::step_downsample(children) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_zv(all_numeric()) %>%
  step_normalize(all_numeric()) %>%
  prep()
## Registered S3 methods overwritten by 'themis':
##   method                  from   
##   bake.step_downsample    recipes
##   bake.step_upsample      recipes
##   prep.step_downsample    recipes
##   prep.step_upsample      recipes
##   tidy.step_downsample    recipes
##   tidy.step_upsample      recipes
##   tunable.step_downsample recipes
##   tunable.step_upsample   recipes
hotel_rec
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          9
## 
## Training data contained 750 data points and no missing data.
## 
## Operations:
## 
## Down-sampling based on children [trained]
## Dummy variables from hotel, arrival_date_month, ... [trained]
## Zero variance filter removed 11 items [trained]
## Centering and scaling for adr, adults, ... [trained]

# 提取准备好的训练集和测试集
train_proc <- bake(hotel_rec, new_data = NULL)
test_proc <- bake(hotel_rec, new_data = hotel_test)

knn

knn_spec <- nearest_neighbor(mode = "classification") %>% set_engine("kknn")
knn_fit <- knn_spec %>% fit(children ~ ., data = train_proc)
knn_fit
## parsnip model object
## 
## Fit time:  11ms 
## 
## Call:
## kknn::train.kknn(formula = children ~ ., data = data, ks = min_rows(5,     data, 5))
## 
## Type of response variable: nominal
## Minimal misclassification: 0.3482143
## Best kernel: optimal
## Best k: 5

随机森林

rf_spec <- rand_forest(mode = "classification") %>% set_engine("ranger")
rf_fit <- rf_spec %>% fit(children ~ ., data = train_proc)
rf_fit
## parsnip model object
## 
## Fit time:  80ms 
## Ranger result
## 
## Call:
##  ranger::ranger(x = maybe_data_frame(x), y = y, num.threads = 1,      verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE) 
## 
## Type:                             Probability estimation 
## Number of trees:                  500 
## Sample size:                      224 
## Number of independent variables:  11 
## Mtry:                             3 
## Target node size:                 10 
## Variable importance mode:         none 
## Splitrule:                        gini 
## OOB prediction error (Brier s.):  0.1935183

决策树

tree_spec <- decision_tree(mode = "classification") %>% set_engine("rpart")
tree_fit <- tree_spec %>% fit(children ~ ., data = train_proc)
tree_fit
## parsnip model object
## 
## Fit time:  0ms 
## n= 224 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 224 112 children (0.50000000 0.50000000)  
##     2) adr>=0.5847452 63  11 children (0.82539683 0.17460317) *
##     3) adr< 0.5847452 161  60 none (0.37267081 0.62732919)  
##       6) total_of_special_requests>=0.472056 43  15 children (0.65116279 0.34883721)  
##        12) meal_HB>=0.5359649 13   2 children (0.84615385 0.15384615) *
##        13) meal_HB< 0.5359649 30  13 children (0.56666667 0.43333333)  
##          26) adr< -0.266918 21   6 children (0.71428571 0.28571429) *
##          27) adr>=-0.266918 9   2 none (0.22222222 0.77777778) *
##       7) total_of_special_requests< 0.472056 118  32 none (0.27118644 0.72881356)  
##        14) adr>=-0.7217095 76  29 none (0.38157895 0.61842105)  
##          28) adr< -0.53929 8   2 children (0.75000000 0.25000000) *
##          29) adr>=-0.53929 68  23 none (0.33823529 0.66176471)  
##            58) adr>=-0.3662756 57  23 none (0.40350877 0.59649123)  
##             116) adr< 0.2846238 38  19 children (0.50000000 0.50000000)  
##               232) stays_in_weekend_nights>=-0.8218309 24   9 children (0.62500000 0.37500000) *
##               233) stays_in_weekend_nights< -0.8218309 14   4 none (0.28571429 0.71428571) *
##             117) adr>=0.2846238 19   4 none (0.21052632 0.78947368) *
##            59) adr< -0.3662756 11   0 none (0.00000000 1.00000000) *
##        15) adr< -0.7217095 42   3 none (0.07142857 0.92857143) *

评价模型

set.seed(1234)
# 模特卡洛法交叉验证
validation_split <- mc_cv(data = train_proc, prop = 0.9, strata = children)
validation_split
## # Monte Carlo cross-validation (0.9/0.1) with 25 resamples  using stratification 
## # A tibble: 25 x 2
##    splits           id        
##    <list>           <chr>     
##  1 <split [202/22]> Resample01
##  2 <split [202/22]> Resample02
##  3 <split [202/22]> Resample03
##  4 <split [202/22]> Resample04
##  5 <split [202/22]> Resample05
##  6 <split [202/22]> Resample06
##  7 <split [202/22]> Resample07
##  8 <split [202/22]> Resample08
##  9 <split [202/22]> Resample09
## 10 <split [202/22]> Resample10
## # ... with 15 more rows
validation_split$splits[[1]] %>% analysis() %>% dim()
## [1] 202  12
validation_split$splits[[1]] %>% assessment()
## # A tibble: 22 x 12
##       adr  adults total_of_special_~ stays_in_week_n~ stays_in_weekend~ children
##     <dbl>   <dbl>              <dbl>            <dbl>             <dbl> <fct>   
##  1 -0.909 -0.0643             0.957             0.675             0.671 children
##  2 -0.961 -2.94               0.957             0.193             0.671 children
##  3 -1.16  -0.0643             0.957            -0.288             0.671 children
##  4  0.837  2.82              -0.0130            0.193            -1.32  children
##  5 -0.698 -0.0643            -0.983             1.64              2.66  children
##  6  0.837 -0.0643            -0.0130           -1.73              0.671 children
##  7 -0.232 -0.0643            -0.0130           -0.770            -1.32  children
##  8  1.23  -0.0643            -0.983             0.675             2.66  children
##  9  1.09  -0.0643            -0.983             2.12              0.671 children
## 10  0.613 -0.0643             0.957            -0.770            -1.32  children
## # ... with 12 more rows, and 6 more variables: arrival_date_month_August <dbl>,
## #   arrival_date_month_July <dbl>, arrival_date_month_September <dbl>,
## #   meal_FB <dbl>, meal_HB <dbl>, required_car_parking_spaces_parking <dbl>

knn_res <- fit_resamples(
  knn_spec,
  children ~ ., 
  validation_split,
  control = control_resamples(save_pred = T)
)
knn_res
## # Resampling results
## # Monte Carlo cross-validation (0.9/0.1) with 25 resamples  using stratification 
## # A tibble: 25 x 5
##    splits         id        .metrics         .notes           .predictions      
##    <list>         <chr>     <list>           <list>           <list>            
##  1 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
##  2 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
##  3 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
##  4 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
##  5 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
##  6 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
##  7 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
##  8 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
##  9 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
## 10 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
## # ... with 15 more rows

knn_res %>% collect_metrics()
## # A tibble: 2 x 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.649    25  0.0230 Preprocessor1_Model1
## 2 roc_auc  binary     0.646    25  0.0270 Preprocessor1_Model1
knn_res %>% collect_metrics(summarise=FALSE)
## # A tibble: 2 x 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.649    25  0.0230 Preprocessor1_Model1
## 2 roc_auc  binary     0.646    25  0.0270 Preprocessor1_Model1

knn_res %>% collect_predictions()
## # A tibble: 550 x 7
##    id        .pred_children .pred_none  .row .pred_class children .config       
##    <chr>              <dbl>      <dbl> <int> <fct>       <fct>    <chr>         
##  1 Resample~         0.0822     0.918      4 none        children Preprocessor1~
##  2 Resample~         0.0822     0.918      5 none        children Preprocessor1~
##  3 Resample~         0.155      0.845      9 none        children Preprocessor1~
##  4 Resample~         0.561      0.439     16 children    children Preprocessor1~
##  5 Resample~         0          1         22 none        children Preprocessor1~
##  6 Resample~         0.180      0.820     28 none        children Preprocessor1~
##  7 Resample~         0.738      0.262     38 children    children Preprocessor1~
##  8 Resample~         0.845      0.155     80 children    children Preprocessor1~
##  9 Resample~         0.975      0.0251    98 children    children Preprocessor1~
## 10 Resample~         0          1        101 none        children Preprocessor1~
## # ... with 540 more rows

rf_res <- fit_resamples(
  rf_spec,
  children ~ ., 
  validation_split,
  control = control_resamples(save_pred = T)
)
rf_res %>% collect_metrics()
## # A tibble: 2 x 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.687    25  0.0158 Preprocessor1_Model1
## 2 roc_auc  binary     0.758    25  0.0154 Preprocessor1_Model1

tree_res <- fit_resamples(
  tree_spec,
  children ~ ., 
  validation_split,
  control = control_resamples(save_pred = T)
)
tree_res %>% collect_metrics()
## # A tibble: 2 x 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.713    25  0.0148 Preprocessor1_Model1
## 2 roc_auc  binary     0.752    25  0.0185 Preprocessor1_Model1

knn_res %>%
  unnest(.predictions) %>%
  mutate(model = "kknn") %>%
  bind_rows(tree_res %>%
    unnest(.predictions) %>%
    mutate(model = "rpart")) %>%
  bind_rows(rf_res %>% 
    unnest(.predictions) %>% 
    mutate(model = "ranger")) %>% 
  group_by(model) %>%
  roc_curve(children, .pred_children) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity, color = model)) +
  geom_line(size = 1.5) +
  geom_abline(
    lty = 2, alpha = 0.5,
    color = "gray50",
    size = 1.2
  )

tidymodels-exercise-02 - 图4

随机森林最好

knn_conf <- knn_res %>% unnest(.predictions) %>% 
  conf_mat(children, .pred_class)
knn_conf
##           Truth
## Prediction children none
##   children      172   90
##   none          103  185

knn_conf %>% autoplot()

tidymodels-exercise-02 - 图5

rf_fit %>% predict(new_data = test_proc, type = "prob") %>% 
  mutate(truth = hotel_test$children) %>% 
  roc_auc(truth, .pred_children)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.798