数据探索
children是结果变量,是一个2分类变量。
library(tidyverse)## -- Attaching packages ----------------------------------------------------------- tidyverse 1.3.1 --## v ggplot2 3.3.3 v purrr 0.3.4## v tibble 3.1.1 v dplyr 1.0.6## v tidyr 1.1.3 v stringr 1.4.0## v readr 1.4.0 v forcats 0.5.1## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --## x dplyr::filter() masks stats::filter()## x dplyr::lag() masks stats::lag()hotels <- readr::read_csv("../datasets/tidytuesday/data/2020/2020-02-11/hotels.csv")#### -- Column specification ----------------------------------------------------------------------------## cols(## .default = col_double(),## hotel = col_character(),## arrival_date_month = col_character(),## meal = col_character(),## country = col_character(),## market_segment = col_character(),## distribution_channel = col_character(),## reserved_room_type = col_character(),## assigned_room_type = col_character(),## deposit_type = col_character(),## agent = col_character(),## company = col_character(),## customer_type = col_character(),## reservation_status = col_character(),## reservation_status_date = col_date(format = "")## )## i<U+00A0>Use `spec()` for the full column specifications.hotel_stays <- hotels %>%filter(is_canceled == 0) %>%mutate(children = case_when( # 学习下case_when函数children + babies > 0 ~ "children",TRUE ~ "none"),required_car_parking_spaces = case_when(required_car_parking_spaces > 0 ~ "parking",TRUE ~ "none")) %>%select(-is_canceled, -reservation_status, -babies)hotel_stays## # A tibble: 75,166 x 29## hotel lead_time arrival_date_year arrival_date_mon~ arrival_date_week_n~## <chr> <dbl> <dbl> <chr> <dbl>## 1 Resort Ho~ 342 2015 July 27## 2 Resort Ho~ 737 2015 July 27## 3 Resort Ho~ 7 2015 July 27## 4 Resort Ho~ 13 2015 July 27## 5 Resort Ho~ 14 2015 July 27## 6 Resort Ho~ 14 2015 July 27## 7 Resort Ho~ 0 2015 July 27## 8 Resort Ho~ 9 2015 July 27## 9 Resort Ho~ 35 2015 July 27## 10 Resort Ho~ 68 2015 July 27## # ... with 75,156 more rows, and 24 more variables:## # arrival_date_day_of_month <dbl>, stays_in_weekend_nights <dbl>,## # stays_in_week_nights <dbl>, adults <dbl>, children <chr>, meal <chr>,## # country <chr>, market_segment <chr>, distribution_channel <chr>,## # is_repeated_guest <dbl>, previous_cancellations <dbl>,## # previous_bookings_not_canceled <dbl>, reserved_room_type <chr>,## # assigned_room_type <chr>, booking_changes <dbl>, deposit_type <chr>,## # agent <chr>, company <chr>, days_in_waiting_list <dbl>,## # customer_type <chr>, adr <dbl>, required_car_parking_spaces <chr>,## # total_of_special_requests <dbl>, reservation_status_date <date>
hotel_stays %>% count(children) # children是结果变量## # A tibble: 2 x 2## children n## <chr> <int>## 1 children 6073## 2 none 69093
skimr::skim(hotel_stays)
Table: Data summary
| Name | hotel_stays |
| Number of rows | 75166 |
| Number of columns | 29 |
| _ | |
| Column type frequency: | |
| character | 14 |
| Date | 1 |
| numeric | 14 |
| __ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| hotel | 0 | 1 | 10 | 12 | 0 | 2 | 0 |
| arrival_date_month | 0 | 1 | 3 | 9 | 0 | 12 | 0 |
| children | 0 | 1 | 4 | 8 | 0 | 2 | 0 |
| meal | 0 | 1 | 2 | 9 | 0 | 5 | 0 |
| country | 0 | 1 | 2 | 4 | 0 | 166 | 0 |
| market_segment | 0 | 1 | 6 | 13 | 0 | 7 | 0 |
| distribution_channel | 0 | 1 | 3 | 9 | 0 | 5 | 0 |
| reserved_room_type | 0 | 1 | 1 | 1 | 0 | 9 | 0 |
| assigned_room_type | 0 | 1 | 1 | 1 | 0 | 10 | 0 |
| deposit_type | 0 | 1 | 10 | 10 | 0 | 3 | 0 |
| agent | 0 | 1 | 1 | 4 | 0 | 315 | 0 |
| company | 0 | 1 | 1 | 4 | 0 | 332 | 0 |
| customer_type | 0 | 1 | 5 | 15 | 0 | 4 | 0 |
| required_car_parking_spaces | 0 | 1 | 4 | 7 | 0 | 2 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| reservation_status_date | 0 | 1 | 2015-07-01 | 2017-09-14 | 2016-09-01 | 805 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| lead_time | 0 | 1 | 79.98 | 91.11 | 0.00 | 9.0 | 45.0 | 124 | 737 | ▇▂▁▁▁ |
| arrival_date_year | 0 | 1 | 2016.15 | 0.70 | 2015.00 | 2016.0 | 2016.0 | 2017 | 2017 | ▃▁▇▁▆ |
| arrival_date_week_number | 0 | 1 | 27.08 | 13.90 | 1.00 | 16.0 | 28.0 | 38 | 53 | ▆▇▇▇▆ |
| arrival_date_day_of_month | 0 | 1 | 15.84 | 8.78 | 1.00 | 8.0 | 16.0 | 23 | 31 | ▇▇▇▇▆ |
| stays_in_weekend_nights | 0 | 1 | 0.93 | 0.99 | 0.00 | 0.0 | 1.0 | 2 | 19 | ▇▁▁▁▁ |
| stays_in_week_nights | 0 | 1 | 2.46 | 1.92 | 0.00 | 1.0 | 2.0 | 3 | 50 | ▇▁▁▁▁ |
| adults | 0 | 1 | 1.83 | 0.51 | 0.00 | 2.0 | 2.0 | 2 | 4 | ▁▂▇▁▁ |
| is_repeated_guest | 0 | 1 | 0.04 | 0.20 | 0.00 | 0.0 | 0.0 | 0 | 1 | ▇▁▁▁▁ |
| previous_cancellations | 0 | 1 | 0.02 | 0.27 | 0.00 | 0.0 | 0.0 | 0 | 13 | ▇▁▁▁▁ |
| previous_bookings_not_canceled | 0 | 1 | 0.20 | 1.81 | 0.00 | 0.0 | 0.0 | 0 | 72 | ▇▁▁▁▁ |
| booking_changes | 0 | 1 | 0.29 | 0.74 | 0.00 | 0.0 | 0.0 | 0 | 21 | ▇▁▁▁▁ |
| days_in_waiting_list | 0 | 1 | 1.59 | 14.78 | 0.00 | 0.0 | 0.0 | 0 | 379 | ▇▁▁▁▁ |
| adr | 0 | 1 | 99.99 | 49.21 | -6.38 | 67.5 | 92.5 | 125 | 510 | ▇▆▁▁▁ |
| total_of_special_requests | 0 | 1 | 0.71 | 0.83 | 0.00 | 0.0 | 1.0 | 1 | 5 | ▇▁▁▁▁ |
# 写出这么刘畅的代码取决于对数据的理解以及对tidyverse包的理解hotel_stays %>%mutate(arrival_date_month = factor(arrival_date_month, levels = month.name)) %>%count(hotel, arrival_date_month, children) %>%group_by(hotel, children) %>%mutate(proportion = n / sum(n)) %>%ggplot(aes(arrival_date_month, proportion, fill = children)) +geom_col(position = "dodge") +scale_y_continuous(labels = scales::percent_format()) + # 学习下百分比的用法facet_wrap(~hotel, nrow = 2) +labs(x = NULL, y = "Proportion of hotel stays", fill = NULL)

hotel_stays %>%count(hotel, required_car_parking_spaces, children) %>%group_by(hotel, children) %>%mutate(proportion = n / sum(n)) %>%ggplot(aes(required_car_parking_spaces, proportion, fill = children)) +geom_col(position = "dodge") +scale_y_continuous(labels = scales::percent_format()) +facet_wrap(~hotel, nrow = 2) +labs(x = NULL, y = "Proportion of hotel stays", fill = NULL)

library(GGally)## Registered S3 method overwritten by 'GGally':## method from## +.gg ggplot2hotel_stays %>%select(children, adr,required_car_parking_spaces,total_of_special_requests) %>%ggpairs(mapping = aes(color = children))## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

建模
数据准备
hotels_df <- hotel_stays %>%select(children, hotel, arrival_date_month, meal, adr, adults,required_car_parking_spaces, total_of_special_requests,stays_in_week_nights, stays_in_weekend_nights) %>%mutate_if(is.character, factor)hotels_df <- hotels_df[1:1000,]
library(tidymodels)## -- Attaching packages ---------------------------------------------------------- tidymodels 0.1.3 --## v broom 0.7.6 v rsample 0.0.9## v dials 0.0.9 v tune 0.1.5## v infer 0.5.4 v workflows 0.2.2## v modeldata 0.1.0 v workflowsets 0.0.2## v parsnip 0.1.5 v yardstick 0.0.8## v recipes 0.1.16## -- Conflicts ------------------------------------------------------------- tidymodels_conflicts() --## x scales::discard() masks purrr::discard()## x dplyr::filter() masks stats::filter()## x recipes::fixed() masks stringr::fixed()## x dplyr::lag() masks stats::lag()## x yardstick::spec() masks readr::spec()## x recipes::step() masks stats::step()## * Use tidymodels_prefer() to resolve common conflicts.tidymodels_prefer()set.seed(12)hotel_split <- hotels_df %>%initial_split()hotel_train <- training(hotel_split)hotel_test <- testing(hotel_split)hotel_rec <- recipe(children ~ ., data = hotel_train) %>%themis::step_downsample(children) %>%step_dummy(all_nominal(), -all_outcomes()) %>%step_zv(all_numeric()) %>%step_normalize(all_numeric()) %>%prep()## Registered S3 methods overwritten by 'themis':## method from## bake.step_downsample recipes## bake.step_upsample recipes## prep.step_downsample recipes## prep.step_upsample recipes## tidy.step_downsample recipes## tidy.step_upsample recipes## tunable.step_downsample recipes## tunable.step_upsample recipeshotel_rec## Data Recipe#### Inputs:#### role #variables## outcome 1## predictor 9#### Training data contained 750 data points and no missing data.#### Operations:#### Down-sampling based on children [trained]## Dummy variables from hotel, arrival_date_month, ... [trained]## Zero variance filter removed 11 items [trained]## Centering and scaling for adr, adults, ... [trained]
# 提取准备好的训练集和测试集train_proc <- bake(hotel_rec, new_data = NULL)test_proc <- bake(hotel_rec, new_data = hotel_test)
knn
knn_spec <- nearest_neighbor(mode = "classification") %>% set_engine("kknn")knn_fit <- knn_spec %>% fit(children ~ ., data = train_proc)knn_fit## parsnip model object#### Fit time: 11ms#### Call:## kknn::train.kknn(formula = children ~ ., data = data, ks = min_rows(5, data, 5))#### Type of response variable: nominal## Minimal misclassification: 0.3482143## Best kernel: optimal## Best k: 5
随机森林
rf_spec <- rand_forest(mode = "classification") %>% set_engine("ranger")rf_fit <- rf_spec %>% fit(children ~ ., data = train_proc)rf_fit## parsnip model object#### Fit time: 80ms## Ranger result#### Call:## ranger::ranger(x = maybe_data_frame(x), y = y, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)#### Type: Probability estimation## Number of trees: 500## Sample size: 224## Number of independent variables: 11## Mtry: 3## Target node size: 10## Variable importance mode: none## Splitrule: gini## OOB prediction error (Brier s.): 0.1935183
决策树
tree_spec <- decision_tree(mode = "classification") %>% set_engine("rpart")tree_fit <- tree_spec %>% fit(children ~ ., data = train_proc)tree_fit## parsnip model object#### Fit time: 0ms## n= 224#### node), split, n, loss, yval, (yprob)## * denotes terminal node#### 1) root 224 112 children (0.50000000 0.50000000)## 2) adr>=0.5847452 63 11 children (0.82539683 0.17460317) *## 3) adr< 0.5847452 161 60 none (0.37267081 0.62732919)## 6) total_of_special_requests>=0.472056 43 15 children (0.65116279 0.34883721)## 12) meal_HB>=0.5359649 13 2 children (0.84615385 0.15384615) *## 13) meal_HB< 0.5359649 30 13 children (0.56666667 0.43333333)## 26) adr< -0.266918 21 6 children (0.71428571 0.28571429) *## 27) adr>=-0.266918 9 2 none (0.22222222 0.77777778) *## 7) total_of_special_requests< 0.472056 118 32 none (0.27118644 0.72881356)## 14) adr>=-0.7217095 76 29 none (0.38157895 0.61842105)## 28) adr< -0.53929 8 2 children (0.75000000 0.25000000) *## 29) adr>=-0.53929 68 23 none (0.33823529 0.66176471)## 58) adr>=-0.3662756 57 23 none (0.40350877 0.59649123)## 116) adr< 0.2846238 38 19 children (0.50000000 0.50000000)## 232) stays_in_weekend_nights>=-0.8218309 24 9 children (0.62500000 0.37500000) *## 233) stays_in_weekend_nights< -0.8218309 14 4 none (0.28571429 0.71428571) *## 117) adr>=0.2846238 19 4 none (0.21052632 0.78947368) *## 59) adr< -0.3662756 11 0 none (0.00000000 1.00000000) *## 15) adr< -0.7217095 42 3 none (0.07142857 0.92857143) *
评价模型
set.seed(1234)# 模特卡洛法交叉验证validation_split <- mc_cv(data = train_proc, prop = 0.9, strata = children)validation_split## # Monte Carlo cross-validation (0.9/0.1) with 25 resamples using stratification## # A tibble: 25 x 2## splits id## <list> <chr>## 1 <split [202/22]> Resample01## 2 <split [202/22]> Resample02## 3 <split [202/22]> Resample03## 4 <split [202/22]> Resample04## 5 <split [202/22]> Resample05## 6 <split [202/22]> Resample06## 7 <split [202/22]> Resample07## 8 <split [202/22]> Resample08## 9 <split [202/22]> Resample09## 10 <split [202/22]> Resample10## # ... with 15 more rowsvalidation_split$splits[[1]] %>% analysis() %>% dim()## [1] 202 12validation_split$splits[[1]] %>% assessment()## # A tibble: 22 x 12## adr adults total_of_special_~ stays_in_week_n~ stays_in_weekend~ children## <dbl> <dbl> <dbl> <dbl> <dbl> <fct>## 1 -0.909 -0.0643 0.957 0.675 0.671 children## 2 -0.961 -2.94 0.957 0.193 0.671 children## 3 -1.16 -0.0643 0.957 -0.288 0.671 children## 4 0.837 2.82 -0.0130 0.193 -1.32 children## 5 -0.698 -0.0643 -0.983 1.64 2.66 children## 6 0.837 -0.0643 -0.0130 -1.73 0.671 children## 7 -0.232 -0.0643 -0.0130 -0.770 -1.32 children## 8 1.23 -0.0643 -0.983 0.675 2.66 children## 9 1.09 -0.0643 -0.983 2.12 0.671 children## 10 0.613 -0.0643 0.957 -0.770 -1.32 children## # ... with 12 more rows, and 6 more variables: arrival_date_month_August <dbl>,## # arrival_date_month_July <dbl>, arrival_date_month_September <dbl>,## # meal_FB <dbl>, meal_HB <dbl>, required_car_parking_spaces_parking <dbl>
knn_res <- fit_resamples(knn_spec,children ~ .,validation_split,control = control_resamples(save_pred = T))knn_res## # Resampling results## # Monte Carlo cross-validation (0.9/0.1) with 25 resamples using stratification## # A tibble: 25 x 5## splits id .metrics .notes .predictions## <list> <chr> <list> <list> <list>## 1 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~## 2 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~## 3 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~## 4 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~## 5 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~## 6 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~## 7 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~## 8 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~## 9 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~## 10 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~## # ... with 15 more rows
knn_res %>% collect_metrics()## # A tibble: 2 x 6## .metric .estimator mean n std_err .config## <chr> <chr> <dbl> <int> <dbl> <chr>## 1 accuracy binary 0.649 25 0.0230 Preprocessor1_Model1## 2 roc_auc binary 0.646 25 0.0270 Preprocessor1_Model1knn_res %>% collect_metrics(summarise=FALSE)## # A tibble: 2 x 6## .metric .estimator mean n std_err .config## <chr> <chr> <dbl> <int> <dbl> <chr>## 1 accuracy binary 0.649 25 0.0230 Preprocessor1_Model1## 2 roc_auc binary 0.646 25 0.0270 Preprocessor1_Model1
knn_res %>% collect_predictions()## # A tibble: 550 x 7## id .pred_children .pred_none .row .pred_class children .config## <chr> <dbl> <dbl> <int> <fct> <fct> <chr>## 1 Resample~ 0.0822 0.918 4 none children Preprocessor1~## 2 Resample~ 0.0822 0.918 5 none children Preprocessor1~## 3 Resample~ 0.155 0.845 9 none children Preprocessor1~## 4 Resample~ 0.561 0.439 16 children children Preprocessor1~## 5 Resample~ 0 1 22 none children Preprocessor1~## 6 Resample~ 0.180 0.820 28 none children Preprocessor1~## 7 Resample~ 0.738 0.262 38 children children Preprocessor1~## 8 Resample~ 0.845 0.155 80 children children Preprocessor1~## 9 Resample~ 0.975 0.0251 98 children children Preprocessor1~## 10 Resample~ 0 1 101 none children Preprocessor1~## # ... with 540 more rows
rf_res <- fit_resamples(rf_spec,children ~ .,validation_split,control = control_resamples(save_pred = T))rf_res %>% collect_metrics()## # A tibble: 2 x 6## .metric .estimator mean n std_err .config## <chr> <chr> <dbl> <int> <dbl> <chr>## 1 accuracy binary 0.687 25 0.0158 Preprocessor1_Model1## 2 roc_auc binary 0.758 25 0.0154 Preprocessor1_Model1
tree_res <- fit_resamples(tree_spec,children ~ .,validation_split,control = control_resamples(save_pred = T))tree_res %>% collect_metrics()## # A tibble: 2 x 6## .metric .estimator mean n std_err .config## <chr> <chr> <dbl> <int> <dbl> <chr>## 1 accuracy binary 0.713 25 0.0148 Preprocessor1_Model1## 2 roc_auc binary 0.752 25 0.0185 Preprocessor1_Model1
knn_res %>%unnest(.predictions) %>%mutate(model = "kknn") %>%bind_rows(tree_res %>%unnest(.predictions) %>%mutate(model = "rpart")) %>%bind_rows(rf_res %>%unnest(.predictions) %>%mutate(model = "ranger")) %>%group_by(model) %>%roc_curve(children, .pred_children) %>%ggplot(aes(x = 1 - specificity, y = sensitivity, color = model)) +geom_line(size = 1.5) +geom_abline(lty = 2, alpha = 0.5,color = "gray50",size = 1.2)

随机森林最好
knn_conf <- knn_res %>% unnest(.predictions) %>%conf_mat(children, .pred_class)knn_conf## Truth## Prediction children none## children 172 90## none 103 185
knn_conf %>% autoplot()

rf_fit %>% predict(new_data = test_proc, type = "prob") %>%mutate(truth = hotel_test$children) %>%roc_auc(truth, .pred_children)## # A tibble: 1 x 3## .metric .estimator .estimate## <chr> <chr> <dbl>## 1 roc_auc binary 0.798
