Data exploration

`children` is the outcome variable: a binary (two-class) variable.

    library(tidyverse)
    ## -- Attaching packages ----------------------------------------------------------- tidyverse 1.3.1 --
    ## v ggplot2 3.3.3     v purrr   0.3.4
    ## v tibble  3.1.1     v dplyr   1.0.6
    ## v tidyr   1.1.3     v stringr 1.4.0
    ## v readr   1.4.0     v forcats 0.5.1
    ## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
    ## x dplyr::filter() masks stats::filter()
    ## x dplyr::lag()    masks stats::lag()

    hotels <- readr::read_csv("../datasets/tidytuesday/data/2020/2020-02-11/hotels.csv")
    ##
    ## -- Column specification ----------------------------------------------------------------------------
    ## cols(
    ##   .default = col_double(),
    ##   hotel = col_character(),
    ##   arrival_date_month = col_character(),
    ##   meal = col_character(),
    ##   country = col_character(),
    ##   market_segment = col_character(),
    ##   distribution_channel = col_character(),
    ##   reserved_room_type = col_character(),
    ##   assigned_room_type = col_character(),
    ##   deposit_type = col_character(),
    ##   agent = col_character(),
    ##   company = col_character(),
    ##   customer_type = col_character(),
    ##   reservation_status = col_character(),
    ##   reservation_status_date = col_date(format = "")
    ## )
    ## i Use `spec()` for the full column specifications.

    hotel_stays <- hotels %>%
      filter(is_canceled == 0) %>%
      mutate(
        # note the use of case_when(); see the short sketch after this block
        children = case_when(
          children + babies > 0 ~ "children",
          TRUE ~ "none"
        ),
        required_car_parking_spaces = case_when(
          required_car_parking_spaces > 0 ~ "parking",
          TRUE ~ "none"
        )
      ) %>%
      select(-is_canceled, -reservation_status, -babies)
    hotel_stays
    ## # A tibble: 75,166 x 29
    ##    hotel      lead_time arrival_date_year arrival_date_mon~ arrival_date_week_n~
    ##    <chr>          <dbl>             <dbl> <chr>                            <dbl>
    ##  1 Resort Ho~       342              2015 July                                27
    ##  2 Resort Ho~       737              2015 July                                27
    ##  3 Resort Ho~         7              2015 July                                27
    ##  4 Resort Ho~        13              2015 July                                27
    ##  5 Resort Ho~        14              2015 July                                27
    ##  6 Resort Ho~        14              2015 July                                27
    ##  7 Resort Ho~         0              2015 July                                27
    ##  8 Resort Ho~         9              2015 July                                27
    ##  9 Resort Ho~        35              2015 July                                27
    ## 10 Resort Ho~        68              2015 July                                27
    ## # ... with 75,156 more rows, and 24 more variables:
    ## #   arrival_date_day_of_month <dbl>, stays_in_weekend_nights <dbl>,
    ## #   stays_in_week_nights <dbl>, adults <dbl>, children <chr>, meal <chr>,
    ## #   country <chr>, market_segment <chr>, distribution_channel <chr>,
    ## #   is_repeated_guest <dbl>, previous_cancellations <dbl>,
    ## #   previous_bookings_not_canceled <dbl>, reserved_room_type <chr>,
    ## #   assigned_room_type <chr>, booking_changes <dbl>, deposit_type <chr>,
    ## #   agent <chr>, company <chr>, days_in_waiting_list <dbl>,
    ## #   customer_type <chr>, adr <dbl>, required_car_parking_spaces <chr>,
    ## #   total_of_special_requests <dbl>, reservation_status_date <date>
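
A quick aside on `case_when()`, as flagged in the comment above: it evaluates a series of two-sided formulas in order and returns the right-hand side of the first condition that is TRUE, with a final `TRUE ~ ...` acting as the catch-all. A minimal sketch on made-up values:

    # minimal case_when() sketch (toy vector, not part of the hotel data)
    x <- c(-2, 0, 5)
    dplyr::case_when(
      x < 0  ~ "negative",
      x == 0 ~ "zero",
      TRUE   ~ "positive"
    )
    # returns "negative" "zero" "positive"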
    hotel_stays %>% count(children) # children is the outcome variable
    ## # A tibble: 2 x 2
    ##   children     n
    ##   <chr>    <int>
    ## 1 children  6073
    ## 2 none     69093
    skimr::skim(hotel_stays)

Table: Data summary

|                        |            |
|:-----------------------|:-----------|
|Name                    |hotel_stays |
|Number of rows          |75166       |
|Number of columns       |29          |
|Column type frequency:  |            |
|character               |14          |
|Date                    |1           |
|numeric                 |14          |
|Group variables         |None        |

Variable type: character

|skim_variable               | n_missing| complete_rate| min| max| empty| n_unique| whitespace|
|:---------------------------|---------:|-------------:|---:|---:|-----:|--------:|----------:|
|hotel                       |         0|             1|  10|  12|     0|        2|          0|
|arrival_date_month          |         0|             1|   3|   9|     0|       12|          0|
|children                    |         0|             1|   4|   8|     0|        2|          0|
|meal                        |         0|             1|   2|   9|     0|        5|          0|
|country                     |         0|             1|   2|   4|     0|      166|          0|
|market_segment              |         0|             1|   6|  13|     0|        7|          0|
|distribution_channel        |         0|             1|   3|   9|     0|        5|          0|
|reserved_room_type          |         0|             1|   1|   1|     0|        9|          0|
|assigned_room_type          |         0|             1|   1|   1|     0|       10|          0|
|deposit_type                |         0|             1|  10|  10|     0|        3|          0|
|agent                       |         0|             1|   1|   4|     0|      315|          0|
|company                     |         0|             1|   1|   4|     0|      332|          0|
|customer_type               |         0|             1|   5|  15|     0|        4|          0|
|required_car_parking_spaces |         0|             1|   4|   7|     0|        2|          0|

Variable type: Date

|skim_variable           | n_missing| complete_rate|min        |max        |median     | n_unique|
|:-----------------------|---------:|-------------:|:----------|:----------|:----------|--------:|
|reservation_status_date |         0|             1|2015-07-01 |2017-09-14 |2016-09-01 |      805|

Variable type: numeric

|skim_variable                  | n_missing| complete_rate|    mean|    sd|      p0|    p25|    p50|  p75| p100|hist  |
|:------------------------------|---------:|-------------:|-------:|-----:|-------:|------:|------:|----:|----:|:-----|
|lead_time                      |         0|             1|   79.98| 91.11|    0.00|    9.0|   45.0|  124|  737|▇▂▁▁▁ |
|arrival_date_year              |         0|             1| 2016.15|  0.70| 2015.00| 2016.0| 2016.0| 2017| 2017|▃▁▇▁▆ |
|arrival_date_week_number       |         0|             1|   27.08| 13.90|    1.00|   16.0|   28.0|   38|   53|▆▇▇▇▆ |
|arrival_date_day_of_month      |         0|             1|   15.84|  8.78|    1.00|    8.0|   16.0|   23|   31|▇▇▇▇▆ |
|stays_in_weekend_nights        |         0|             1|    0.93|  0.99|    0.00|    0.0|    1.0|    2|   19|▇▁▁▁▁ |
|stays_in_week_nights           |         0|             1|    2.46|  1.92|    0.00|    1.0|    2.0|    3|   50|▇▁▁▁▁ |
|adults                         |         0|             1|    1.83|  0.51|    0.00|    2.0|    2.0|    2|    4|▁▂▇▁▁ |
|is_repeated_guest              |         0|             1|    0.04|  0.20|    0.00|    0.0|    0.0|    0|    1|▇▁▁▁▁ |
|previous_cancellations         |         0|             1|    0.02|  0.27|    0.00|    0.0|    0.0|    0|   13|▇▁▁▁▁ |
|previous_bookings_not_canceled |         0|             1|    0.20|  1.81|    0.00|    0.0|    0.0|    0|   72|▇▁▁▁▁ |
|booking_changes                |         0|             1|    0.29|  0.74|    0.00|    0.0|    0.0|    0|   21|▇▁▁▁▁ |
|days_in_waiting_list           |         0|             1|    1.59| 14.78|    0.00|    0.0|    0.0|    0|  379|▇▁▁▁▁ |
|adr                            |         0|             1|   99.99| 49.21|   -6.38|   67.5|   92.5|  125|  510|▇▆▁▁▁ |
|total_of_special_requests      |         0|             1|    0.71|  0.83|    0.00|    0.0|    1.0|    1|    5|▇▁▁▁▁ |
    # Writing code this fluent depends on a good grasp of both the data and the tidyverse packages
    hotel_stays %>%
      mutate(arrival_date_month = factor(arrival_date_month, levels = month.name)) %>%
      count(hotel, arrival_date_month, children) %>%
      group_by(hotel, children) %>%
      mutate(proportion = n / sum(n)) %>%
      ggplot(aes(arrival_date_month, proportion, fill = children)) +
      geom_col(position = "dodge") +
      # note how the percent labels are produced; see the short sketch after the figure
      scale_y_continuous(labels = scales::percent_format()) +
      facet_wrap(~hotel, nrow = 2) +
      labs(x = NULL, y = "Proportion of hotel stays", fill = NULL)

tidymodels-exercise-02 - Figure 1
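
As promised above, a minimal sketch of what `scales::percent_format()` does: it returns a labelling function that multiplies proportions by 100 and appends a percent sign (an explicit `accuracy` is set here so the output is predictable):

    # percent_format() returns a function; applied to proportions it yields percent labels
    scales::percent_format(accuracy = 0.1)(c(0.05, 0.257, 0.8))
    # "5.0%" "25.7%" "80.0%"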

    hotel_stays %>%
      count(hotel, required_car_parking_spaces, children) %>%
      group_by(hotel, children) %>%
      mutate(proportion = n / sum(n)) %>%
      ggplot(aes(required_car_parking_spaces, proportion, fill = children)) +
      geom_col(position = "dodge") +
      scale_y_continuous(labels = scales::percent_format()) +
      facet_wrap(~hotel, nrow = 2) +
      labs(x = NULL, y = "Proportion of hotel stays", fill = NULL)

tidymodels-exercise-02 - Figure 2

    library(GGally)
    ## Registered S3 method overwritten by 'GGally':
    ##   method from
    ##   +.gg   ggplot2
    hotel_stays %>%
      select(
        children, adr,
        required_car_parking_spaces,
        total_of_special_requests
      ) %>%
      ggpairs(mapping = aes(color = children))
    ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
    ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
    ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
    ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

tidymodels-exercise-02 - Figure 3

Modeling

Data preparation

    hotels_df <- hotel_stays %>%
      select(
        children, hotel, arrival_date_month, meal, adr, adults,
        required_car_parking_spaces, total_of_special_requests,
        stays_in_week_nights, stays_in_weekend_nights
      ) %>%
      mutate_if(is.character, factor)
    # keep only the first 1000 rows so the models below fit quickly
    hotels_df <- hotels_df[1:1000, ]
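
A side note: `mutate_if()` still works but is superseded in current dplyr. Assuming dplyr >= 1.0, the same preparation can be written with `across()`/`where()`, as in this sketch:

    # equivalent data preparation written with across()/where() (dplyr >= 1.0)
    hotels_df <- hotel_stays %>%
      select(
        children, hotel, arrival_date_month, meal, adr, adults,
        required_car_parking_spaces, total_of_special_requests,
        stays_in_week_nights, stays_in_weekend_nights
      ) %>%
      mutate(across(where(is.character), factor)) %>%
      slice(1:1000)  # same first-1000-row subset as above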
    library(tidymodels)
    ## -- Attaching packages ---------------------------------------------------------- tidymodels 0.1.3 --
    ## v broom     0.7.6      v rsample      0.0.9
    ## v dials     0.0.9      v tune         0.1.5
    ## v infer     0.5.4      v workflows    0.2.2
    ## v modeldata 0.1.0      v workflowsets 0.0.2
    ## v parsnip   0.1.5      v yardstick    0.0.8
    ## v recipes   0.1.16
    ## -- Conflicts ------------------------------------------------------------- tidymodels_conflicts() --
    ## x scales::discard() masks purrr::discard()
    ## x dplyr::filter()   masks stats::filter()
    ## x recipes::fixed()  masks stringr::fixed()
    ## x dplyr::lag()      masks stats::lag()
    ## x yardstick::spec() masks readr::spec()
    ## x recipes::step()   masks stats::step()
    ## * Use tidymodels_prefer() to resolve common conflicts.
    tidymodels_prefer()

    set.seed(12)
    hotel_split <- hotels_df %>%
      initial_split()
    hotel_train <- training(hotel_split)
    hotel_test <- testing(hotel_split)

    hotel_rec <- recipe(children ~ ., data = hotel_train) %>%
      themis::step_downsample(children) %>%          # balance the two classes by down-sampling
      step_dummy(all_nominal(), -all_outcomes()) %>% # dummy-code the nominal predictors
      step_zv(all_numeric()) %>%                     # drop zero-variance columns
      step_normalize(all_numeric()) %>%              # center and scale the numeric predictors
      prep()
    ## Registered S3 methods overwritten by 'themis':
    ##   method                  from
    ##   bake.step_downsample    recipes
    ##   bake.step_upsample      recipes
    ##   prep.step_downsample    recipes
    ##   prep.step_upsample      recipes
    ##   tidy.step_downsample    recipes
    ##   tidy.step_upsample      recipes
    ##   tunable.step_downsample recipes
    ##   tunable.step_upsample   recipes
    hotel_rec
    ## Data Recipe
    ##
    ## Inputs:
    ##
    ##       role #variables
    ##    outcome          1
    ##  predictor          9
    ##
    ## Training data contained 750 data points and no missing data.
    ##
    ## Operations:
    ##
    ## Down-sampling based on children [trained]
    ## Dummy variables from hotel, arrival_date_month, ... [trained]
    ## Zero variance filter removed 11 items [trained]
    ## Centering and scaling for adr, adults, ... [trained]
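
Here the recipe is `prep()`-ed by hand and the processed data are extracted with `bake()` in the next block. An alternative many tidymodels examples use is to leave the recipe unprepped and bundle it with a model spec in a `workflow()`, which then handles the preprocessing automatically at fit time. A rough sketch (not what the rest of this post does):

    # sketch: bundle the unprepped recipe and a model spec in a workflow
    hotel_wf <- workflow() %>%
      add_recipe(
        recipe(children ~ ., data = hotel_train) %>%
          themis::step_downsample(children) %>%
          step_dummy(all_nominal(), -all_outcomes()) %>%
          step_zv(all_numeric()) %>%
          step_normalize(all_numeric())
      ) %>%
      add_model(rand_forest(mode = "classification") %>% set_engine("ranger"))
    # fit(hotel_wf, data = hotel_train) would prep the recipe and fit the model in one step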
    # extract the processed training and test sets
    train_proc <- bake(hotel_rec, new_data = NULL)
    test_proc <- bake(hotel_rec, new_data = hotel_test)
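
One detail worth checking here: `step_downsample()` is skipped by default when baking new data, so only the training set is balanced while the test set keeps its original class mix. A quick check along these lines (not run in the original post):

    # the baked training data should have equal class counts after down-sampling;
    # the baked test data keeps the original imbalance
    train_proc %>% count(children)
    test_proc %>% count(children)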

KNN

    knn_spec <- nearest_neighbor(mode = "classification") %>% set_engine("kknn")
    knn_fit <- knn_spec %>% fit(children ~ ., data = train_proc)
    knn_fit
    ## parsnip model object
    ##
    ## Fit time:  11ms
    ##
    ## Call:
    ## kknn::train.kknn(formula = children ~ ., data = data, ks = min_rows(5, data, 5))
    ##
    ## Type of response variable: nominal
    ## Minimal misclassification: 0.3482143
    ## Best kernel: optimal
    ## Best k: 5
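
With the defaults, parsnip hands the kknn engine k = 5 (see the call above). The number of neighbours can also be set, or tuned, explicitly; a sketch not fitted in the original post:

    # sketch: fix the number of neighbours instead of relying on the engine default
    knn_spec_k9 <- nearest_neighbor(mode = "classification", neighbors = 9) %>%
      set_engine("kknn")
    # knn_spec_k9 %>% fit(children ~ ., data = train_proc)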

Random forest

    rf_spec <- rand_forest(mode = "classification") %>% set_engine("ranger")
    rf_fit <- rf_spec %>% fit(children ~ ., data = train_proc)
    rf_fit
    ## parsnip model object
    ##
    ## Fit time:  80ms
    ## Ranger result
    ##
    ## Call:
    ## ranger::ranger(x = maybe_data_frame(x), y = y, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
    ##
    ## Type:                             Probability estimation
    ## Number of trees:                  500
    ## Sample size:                      224
    ## Number of independent variables:  11
    ## Mtry:                             3
    ## Target node size:                 10
    ## Variable importance mode:         none
    ## Splitrule:                        gini
    ## OOB prediction error (Brier s.):  0.1935183
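
The printout shows `Variable importance mode: none`, so this fit stores no variable importances. To obtain them, the ranger-specific `importance` argument can be passed through `set_engine()`; a sketch not run in the original post:

    # sketch: refit with permutation importance enabled (ranger engine argument)
    rf_imp_fit <- rand_forest(mode = "classification") %>%
      set_engine("ranger", importance = "permutation") %>%
      fit(children ~ ., data = train_proc)
    # rf_imp_fit$fit$variable.importance  # named vector, one value per predictor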

Decision tree

    tree_spec <- decision_tree(mode = "classification") %>% set_engine("rpart")
    tree_fit <- tree_spec %>% fit(children ~ ., data = train_proc)
    tree_fit
    ## parsnip model object
    ##
    ## Fit time:  0ms
    ## n= 224
    ##
    ## node), split, n, loss, yval, (yprob)
    ##       * denotes terminal node
    ##
    ##  1) root 224 112 children (0.50000000 0.50000000)
    ##    2) adr>=0.5847452 63 11 children (0.82539683 0.17460317) *
    ##    3) adr< 0.5847452 161 60 none (0.37267081 0.62732919)
    ##      6) total_of_special_requests>=0.472056 43 15 children (0.65116279 0.34883721)
    ##       12) meal_HB>=0.5359649 13 2 children (0.84615385 0.15384615) *
    ##       13) meal_HB< 0.5359649 30 13 children (0.56666667 0.43333333)
    ##         26) adr< -0.266918 21 6 children (0.71428571 0.28571429) *
    ##         27) adr>=-0.266918 9 2 none (0.22222222 0.77777778) *
    ##      7) total_of_special_requests< 0.472056 118 32 none (0.27118644 0.72881356)
    ##       14) adr>=-0.7217095 76 29 none (0.38157895 0.61842105)
    ##         28) adr< -0.53929 8 2 children (0.75000000 0.25000000) *
    ##         29) adr>=-0.53929 68 23 none (0.33823529 0.66176471)
    ##           58) adr>=-0.3662756 57 23 none (0.40350877 0.59649123)
    ##            116) adr< 0.2846238 38 19 children (0.50000000 0.50000000)
    ##              232) stays_in_weekend_nights>=-0.8218309 24 9 children (0.62500000 0.37500000) *
    ##              233) stays_in_weekend_nights< -0.8218309 14 4 none (0.28571429 0.71428571) *
    ##            117) adr>=0.2846238 19 4 none (0.21052632 0.78947368) *
    ##           59) adr< -0.3662756 11 0 none (0.00000000 1.00000000) *
    ##       15) adr< -0.7217095 42 3 none (0.07142857 0.92857143) *
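
The text dump above is hard to read. If the rpart.plot package is available (an extra dependency not used anywhere else in this post), the raw rpart object stored by parsnip can be drawn directly; a sketch:

    # sketch: plot the fitted tree; tree_fit$fit is the underlying rpart object
    rpart.plot::rpart.plot(tree_fit$fit, roundint = FALSE)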

Model evaluation

    set.seed(1234)
    # Monte Carlo cross-validation
    validation_split <- mc_cv(data = train_proc, prop = 0.9, strata = children)
    validation_split
    ## # Monte Carlo cross-validation (0.9/0.1) with 25 resamples using stratification
    ## # A tibble: 25 x 2
    ##    splits           id
    ##    <list>           <chr>
    ##  1 <split [202/22]> Resample01
    ##  2 <split [202/22]> Resample02
    ##  3 <split [202/22]> Resample03
    ##  4 <split [202/22]> Resample04
    ##  5 <split [202/22]> Resample05
    ##  6 <split [202/22]> Resample06
    ##  7 <split [202/22]> Resample07
    ##  8 <split [202/22]> Resample08
    ##  9 <split [202/22]> Resample09
    ## 10 <split [202/22]> Resample10
    ## # ... with 15 more rows
    validation_split$splits[[1]] %>% analysis() %>% dim()
    ## [1] 202  12
    validation_split$splits[[1]] %>% assessment()
    ## # A tibble: 22 x 12
    ##       adr adults total_of_special_~ stays_in_week_n~ stays_in_weekend~ children
    ##     <dbl>  <dbl>              <dbl>            <dbl>             <dbl> <fct>
    ##  1 -0.909 -0.0643             0.957            0.675             0.671 children
    ##  2 -0.961 -2.94               0.957            0.193             0.671 children
    ##  3 -1.16  -0.0643             0.957           -0.288             0.671 children
    ##  4  0.837  2.82              -0.0130           0.193            -1.32  children
    ##  5 -0.698 -0.0643            -0.983            1.64              2.66  children
    ##  6  0.837 -0.0643            -0.0130          -1.73              0.671 children
    ##  7 -0.232 -0.0643            -0.0130          -0.770            -1.32  children
    ##  8  1.23  -0.0643            -0.983            0.675             2.66  children
    ##  9  1.09  -0.0643            -0.983            2.12              0.671 children
    ## 10  0.613 -0.0643             0.957           -0.770            -1.32  children
    ## # ... with 12 more rows, and 6 more variables: arrival_date_month_August <dbl>,
    ## #   arrival_date_month_July <dbl>, arrival_date_month_September <dbl>,
    ## #   meal_FB <dbl>, meal_HB <dbl>, required_car_parking_spaces_parking <dbl>
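
Monte Carlo CV draws 25 random 90/10 splits of the processed training data. Plain k-fold cross-validation is the more common tidymodels default and would work the same way downstream; a sketch:

    # sketch: 10-fold cross-validation as an alternative resampling scheme
    set.seed(1234)
    folds <- vfold_cv(train_proc, v = 10, strata = children)
    # folds could be passed to fit_resamples() exactly like validation_split below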
    knn_res <- fit_resamples(
      knn_spec,
      children ~ .,
      validation_split,
      control = control_resamples(save_pred = TRUE)
    )
    knn_res
    ## # Resampling results
    ## # Monte Carlo cross-validation (0.9/0.1) with 25 resamples using stratification
    ## # A tibble: 25 x 5
    ##    splits         id        .metrics         .notes           .predictions
    ##    <list>         <chr>     <list>           <list>           <list>
    ##  1 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
    ##  2 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
    ##  3 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
    ##  4 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
    ##  5 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
    ##  6 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
    ##  7 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
    ##  8 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
    ##  9 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
    ## 10 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
    ## # ... with 15 more rows
    knn_res %>% collect_metrics()
    ## # A tibble: 2 x 6
    ##   .metric  .estimator  mean     n std_err .config
    ##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>
    ## 1 accuracy binary     0.649    25  0.0230 Preprocessor1_Model1
    ## 2 roc_auc  binary     0.646    25  0.0270 Preprocessor1_Model1
    # note: the argument is spelled `summarize`; the misspelling below appears to be
    # ignored, which is why the summarized metrics are printed again rather than one
    # row per resample
    knn_res %>% collect_metrics(summarise = FALSE)
    ## # A tibble: 2 x 6
    ##   .metric  .estimator  mean     n std_err .config
    ##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>
    ## 1 accuracy binary     0.649    25  0.0230 Preprocessor1_Model1
    ## 2 roc_auc  binary     0.646    25  0.0270 Preprocessor1_Model1
    knn_res %>% collect_predictions()
    ## # A tibble: 550 x 7
    ##    id        .pred_children .pred_none  .row .pred_class children .config
    ##    <chr>              <dbl>      <dbl> <int> <fct>       <fct>    <chr>
    ##  1 Resample~         0.0822     0.918      4 none        children Preprocessor1~
    ##  2 Resample~         0.0822     0.918      5 none        children Preprocessor1~
    ##  3 Resample~         0.155      0.845      9 none        children Preprocessor1~
    ##  4 Resample~         0.561      0.439     16 children    children Preprocessor1~
    ##  5 Resample~         0          1         22 none        children Preprocessor1~
    ##  6 Resample~         0.180      0.820     28 none        children Preprocessor1~
    ##  7 Resample~         0.738      0.262     38 children    children Preprocessor1~
    ##  8 Resample~         0.845      0.155     80 children    children Preprocessor1~
    ##  9 Resample~         0.975      0.0251    98 children    children Preprocessor1~
    ## 10 Resample~         0          1        101 none        children Preprocessor1~
    ## # ... with 540 more rows
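
Because `save_pred = TRUE` was set, the saved predictions can also be summarised by hand, for example per-resample accuracy; a sketch:

    # sketch: accuracy computed per resample from the saved predictions
    knn_res %>%
      collect_predictions() %>%
      group_by(id) %>%
      accuracy(truth = children, estimate = .pred_class)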
    rf_res <- fit_resamples(
      rf_spec,
      children ~ .,
      validation_split,
      control = control_resamples(save_pred = TRUE)
    )
    rf_res %>% collect_metrics()
    ## # A tibble: 2 x 6
    ##   .metric  .estimator  mean     n std_err .config
    ##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>
    ## 1 accuracy binary     0.687    25  0.0158 Preprocessor1_Model1
    ## 2 roc_auc  binary     0.758    25  0.0154 Preprocessor1_Model1
    tree_res <- fit_resamples(
      tree_spec,
      children ~ .,
      validation_split,
      control = control_resamples(save_pred = TRUE)
    )
    tree_res %>% collect_metrics()
    ## # A tibble: 2 x 6
    ##   .metric  .estimator  mean     n std_err .config
    ##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>
    ## 1 accuracy binary     0.713    25  0.0148 Preprocessor1_Model1
    ## 2 roc_auc  binary     0.752    25  0.0185 Preprocessor1_Model1
    knn_res %>%
      unnest(.predictions) %>%
      mutate(model = "kknn") %>%
      bind_rows(tree_res %>%
        unnest(.predictions) %>%
        mutate(model = "rpart")) %>%
      bind_rows(rf_res %>%
        unnest(.predictions) %>%
        mutate(model = "ranger")) %>%
      group_by(model) %>%
      roc_curve(children, .pred_children) %>%
      ggplot(aes(x = 1 - specificity, y = sensitivity, color = model)) +
      geom_line(size = 1.5) +
      geom_abline(
        lty = 2, alpha = 0.5,
        color = "gray50",
        size = 1.2
      )

tidymodels-exercise-02 - Figure 4

Judging by the ROC curves (and the resampling metrics above), the random forest performs best.
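
The same comparison can be written a bit more compactly with `collect_predictions()` and yardstick's `autoplot()` method for ROC curves; a sketch that should produce an equivalent figure:

    # sketch: ROC curves per model via collect_predictions() + autoplot()
    bind_rows(
      collect_predictions(knn_res)  %>% mutate(model = "kknn"),
      collect_predictions(tree_res) %>% mutate(model = "rpart"),
      collect_predictions(rf_res)   %>% mutate(model = "ranger")
    ) %>%
      group_by(model) %>%
      roc_curve(children, .pred_children) %>%
      autoplot()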

    knn_conf <- knn_res %>%
      unnest(.predictions) %>%
      conf_mat(children, .pred_class)
    knn_conf
    ##           Truth
    ## Prediction children none
    ##   children      172   90
    ##   none          103  185
    knn_conf %>% autoplot()

tidymodels-exercise-02 - Figure 5
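
A `conf_mat` object also has a `summary()` method that derives accuracy, sensitivity, specificity, and related statistics from the same table; a sketch:

    # sketch: scalar metrics derived from the confusion matrix
    knn_conf %>% summary()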

    rf_fit %>%
      predict(new_data = test_proc, type = "prob") %>%
      mutate(truth = hotel_test$children) %>%
      roc_auc(truth, .pred_children)
    ## # A tibble: 1 x 3
    ##   .metric .estimator .estimate
    ##   <chr>   <chr>          <dbl>
    ## 1 roc_auc binary         0.798
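
As a last check, hard-class metrics on the test set can be added alongside the ROC AUC; a sketch not run in the original post:

    # sketch: test-set accuracy and confusion matrix for the random forest
    rf_test <- rf_fit %>%
      predict(new_data = test_proc, type = "class") %>%
      mutate(truth = hotel_test$children)
    rf_test %>% accuracy(truth, .pred_class)
    rf_test %>% conf_mat(truth, .pred_class)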