数据探索
children是结果变量,是一个2分类变量。
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
hotels <- readr::read_csv("../datasets/tidytuesday/data/2020/2020-02-11/hotels.csv")
##
## -- Column specification ----------------------------------------------------------------------------
## cols(
## .default = col_double(),
## hotel = col_character(),
## arrival_date_month = col_character(),
## meal = col_character(),
## country = col_character(),
## market_segment = col_character(),
## distribution_channel = col_character(),
## reserved_room_type = col_character(),
## assigned_room_type = col_character(),
## deposit_type = col_character(),
## agent = col_character(),
## company = col_character(),
## customer_type = col_character(),
## reservation_status = col_character(),
## reservation_status_date = col_date(format = "")
## )
## i<U+00A0>Use `spec()` for the full column specifications.
hotel_stays <- hotels %>%
filter(is_canceled == 0) %>%
mutate(
children = case_when( # 学习下case_when函数
children + babies > 0 ~ "children",
TRUE ~ "none"
),
required_car_parking_spaces = case_when(
required_car_parking_spaces > 0 ~ "parking",
TRUE ~ "none"
)
) %>%
select(-is_canceled, -reservation_status, -babies)
hotel_stays
## # A tibble: 75,166 x 29
## hotel lead_time arrival_date_year arrival_date_mon~ arrival_date_week_n~
## <chr> <dbl> <dbl> <chr> <dbl>
## 1 Resort Ho~ 342 2015 July 27
## 2 Resort Ho~ 737 2015 July 27
## 3 Resort Ho~ 7 2015 July 27
## 4 Resort Ho~ 13 2015 July 27
## 5 Resort Ho~ 14 2015 July 27
## 6 Resort Ho~ 14 2015 July 27
## 7 Resort Ho~ 0 2015 July 27
## 8 Resort Ho~ 9 2015 July 27
## 9 Resort Ho~ 35 2015 July 27
## 10 Resort Ho~ 68 2015 July 27
## # ... with 75,156 more rows, and 24 more variables:
## # arrival_date_day_of_month <dbl>, stays_in_weekend_nights <dbl>,
## # stays_in_week_nights <dbl>, adults <dbl>, children <chr>, meal <chr>,
## # country <chr>, market_segment <chr>, distribution_channel <chr>,
## # is_repeated_guest <dbl>, previous_cancellations <dbl>,
## # previous_bookings_not_canceled <dbl>, reserved_room_type <chr>,
## # assigned_room_type <chr>, booking_changes <dbl>, deposit_type <chr>,
## # agent <chr>, company <chr>, days_in_waiting_list <dbl>,
## # customer_type <chr>, adr <dbl>, required_car_parking_spaces <chr>,
## # total_of_special_requests <dbl>, reservation_status_date <date>
hotel_stays %>% count(children) # children是结果变量
## # A tibble: 2 x 2
## children n
## <chr> <int>
## 1 children 6073
## 2 none 69093
skimr::skim(hotel_stays)
Table: Data summary
Name | hotel_stays |
Number of rows | 75166 |
Number of columns | 29 |
_ | |
Column type frequency: | |
character | 14 |
Date | 1 |
numeric | 14 |
__ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
hotel | 0 | 1 | 10 | 12 | 0 | 2 | 0 |
arrival_date_month | 0 | 1 | 3 | 9 | 0 | 12 | 0 |
children | 0 | 1 | 4 | 8 | 0 | 2 | 0 |
meal | 0 | 1 | 2 | 9 | 0 | 5 | 0 |
country | 0 | 1 | 2 | 4 | 0 | 166 | 0 |
market_segment | 0 | 1 | 6 | 13 | 0 | 7 | 0 |
distribution_channel | 0 | 1 | 3 | 9 | 0 | 5 | 0 |
reserved_room_type | 0 | 1 | 1 | 1 | 0 | 9 | 0 |
assigned_room_type | 0 | 1 | 1 | 1 | 0 | 10 | 0 |
deposit_type | 0 | 1 | 10 | 10 | 0 | 3 | 0 |
agent | 0 | 1 | 1 | 4 | 0 | 315 | 0 |
company | 0 | 1 | 1 | 4 | 0 | 332 | 0 |
customer_type | 0 | 1 | 5 | 15 | 0 | 4 | 0 |
required_car_parking_spaces | 0 | 1 | 4 | 7 | 0 | 2 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
reservation_status_date | 0 | 1 | 2015-07-01 | 2017-09-14 | 2016-09-01 | 805 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
lead_time | 0 | 1 | 79.98 | 91.11 | 0.00 | 9.0 | 45.0 | 124 | 737 | ▇▂▁▁▁ |
arrival_date_year | 0 | 1 | 2016.15 | 0.70 | 2015.00 | 2016.0 | 2016.0 | 2017 | 2017 | ▃▁▇▁▆ |
arrival_date_week_number | 0 | 1 | 27.08 | 13.90 | 1.00 | 16.0 | 28.0 | 38 | 53 | ▆▇▇▇▆ |
arrival_date_day_of_month | 0 | 1 | 15.84 | 8.78 | 1.00 | 8.0 | 16.0 | 23 | 31 | ▇▇▇▇▆ |
stays_in_weekend_nights | 0 | 1 | 0.93 | 0.99 | 0.00 | 0.0 | 1.0 | 2 | 19 | ▇▁▁▁▁ |
stays_in_week_nights | 0 | 1 | 2.46 | 1.92 | 0.00 | 1.0 | 2.0 | 3 | 50 | ▇▁▁▁▁ |
adults | 0 | 1 | 1.83 | 0.51 | 0.00 | 2.0 | 2.0 | 2 | 4 | ▁▂▇▁▁ |
is_repeated_guest | 0 | 1 | 0.04 | 0.20 | 0.00 | 0.0 | 0.0 | 0 | 1 | ▇▁▁▁▁ |
previous_cancellations | 0 | 1 | 0.02 | 0.27 | 0.00 | 0.0 | 0.0 | 0 | 13 | ▇▁▁▁▁ |
previous_bookings_not_canceled | 0 | 1 | 0.20 | 1.81 | 0.00 | 0.0 | 0.0 | 0 | 72 | ▇▁▁▁▁ |
booking_changes | 0 | 1 | 0.29 | 0.74 | 0.00 | 0.0 | 0.0 | 0 | 21 | ▇▁▁▁▁ |
days_in_waiting_list | 0 | 1 | 1.59 | 14.78 | 0.00 | 0.0 | 0.0 | 0 | 379 | ▇▁▁▁▁ |
adr | 0 | 1 | 99.99 | 49.21 | -6.38 | 67.5 | 92.5 | 125 | 510 | ▇▆▁▁▁ |
total_of_special_requests | 0 | 1 | 0.71 | 0.83 | 0.00 | 0.0 | 1.0 | 1 | 5 | ▇▁▁▁▁ |
# 写出这么刘畅的代码取决于对数据的理解以及对tidyverse包的理解
hotel_stays %>%
mutate(arrival_date_month = factor(arrival_date_month, levels = month.name)) %>%
count(hotel, arrival_date_month, children) %>%
group_by(hotel, children) %>%
mutate(proportion = n / sum(n)) %>%
ggplot(aes(arrival_date_month, proportion, fill = children)) +
geom_col(position = "dodge") +
scale_y_continuous(labels = scales::percent_format()) + # 学习下百分比的用法
facet_wrap(~hotel, nrow = 2) +
labs(x = NULL, y = "Proportion of hotel stays", fill = NULL)
hotel_stays %>%
count(hotel, required_car_parking_spaces, children) %>%
group_by(hotel, children) %>%
mutate(proportion = n / sum(n)) %>%
ggplot(aes(required_car_parking_spaces, proportion, fill = children)) +
geom_col(position = "dodge") +
scale_y_continuous(labels = scales::percent_format()) +
facet_wrap(~hotel, nrow = 2) +
labs(x = NULL, y = "Proportion of hotel stays", fill = NULL)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
hotel_stays %>%
select(
children, adr,
required_car_parking_spaces,
total_of_special_requests
) %>%
ggpairs(mapping = aes(color = children))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
建模
数据准备
hotels_df <- hotel_stays %>%
select(
children, hotel, arrival_date_month, meal, adr, adults,
required_car_parking_spaces, total_of_special_requests,
stays_in_week_nights, stays_in_weekend_nights
) %>%
mutate_if(is.character, factor)
hotels_df <- hotels_df[1:1000,]
library(tidymodels)
## -- Attaching packages ---------------------------------------------------------- tidymodels 0.1.3 --
## v broom 0.7.6 v rsample 0.0.9
## v dials 0.0.9 v tune 0.1.5
## v infer 0.5.4 v workflows 0.2.2
## v modeldata 0.1.0 v workflowsets 0.0.2
## v parsnip 0.1.5 v yardstick 0.0.8
## v recipes 0.1.16
## -- Conflicts ------------------------------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## * Use tidymodels_prefer() to resolve common conflicts.
tidymodels_prefer()
set.seed(12)
hotel_split <- hotels_df %>%
initial_split()
hotel_train <- training(hotel_split)
hotel_test <- testing(hotel_split)
hotel_rec <- recipe(children ~ ., data = hotel_train) %>%
themis::step_downsample(children) %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
step_zv(all_numeric()) %>%
step_normalize(all_numeric()) %>%
prep()
## Registered S3 methods overwritten by 'themis':
## method from
## bake.step_downsample recipes
## bake.step_upsample recipes
## prep.step_downsample recipes
## prep.step_upsample recipes
## tidy.step_downsample recipes
## tidy.step_upsample recipes
## tunable.step_downsample recipes
## tunable.step_upsample recipes
hotel_rec
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 9
##
## Training data contained 750 data points and no missing data.
##
## Operations:
##
## Down-sampling based on children [trained]
## Dummy variables from hotel, arrival_date_month, ... [trained]
## Zero variance filter removed 11 items [trained]
## Centering and scaling for adr, adults, ... [trained]
# 提取准备好的训练集和测试集
train_proc <- bake(hotel_rec, new_data = NULL)
test_proc <- bake(hotel_rec, new_data = hotel_test)
knn
knn_spec <- nearest_neighbor(mode = "classification") %>% set_engine("kknn")
knn_fit <- knn_spec %>% fit(children ~ ., data = train_proc)
knn_fit
## parsnip model object
##
## Fit time: 11ms
##
## Call:
## kknn::train.kknn(formula = children ~ ., data = data, ks = min_rows(5, data, 5))
##
## Type of response variable: nominal
## Minimal misclassification: 0.3482143
## Best kernel: optimal
## Best k: 5
随机森林
rf_spec <- rand_forest(mode = "classification") %>% set_engine("ranger")
rf_fit <- rf_spec %>% fit(children ~ ., data = train_proc)
rf_fit
## parsnip model object
##
## Fit time: 80ms
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
##
## Type: Probability estimation
## Number of trees: 500
## Sample size: 224
## Number of independent variables: 11
## Mtry: 3
## Target node size: 10
## Variable importance mode: none
## Splitrule: gini
## OOB prediction error (Brier s.): 0.1935183
决策树
tree_spec <- decision_tree(mode = "classification") %>% set_engine("rpart")
tree_fit <- tree_spec %>% fit(children ~ ., data = train_proc)
tree_fit
## parsnip model object
##
## Fit time: 0ms
## n= 224
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 224 112 children (0.50000000 0.50000000)
## 2) adr>=0.5847452 63 11 children (0.82539683 0.17460317) *
## 3) adr< 0.5847452 161 60 none (0.37267081 0.62732919)
## 6) total_of_special_requests>=0.472056 43 15 children (0.65116279 0.34883721)
## 12) meal_HB>=0.5359649 13 2 children (0.84615385 0.15384615) *
## 13) meal_HB< 0.5359649 30 13 children (0.56666667 0.43333333)
## 26) adr< -0.266918 21 6 children (0.71428571 0.28571429) *
## 27) adr>=-0.266918 9 2 none (0.22222222 0.77777778) *
## 7) total_of_special_requests< 0.472056 118 32 none (0.27118644 0.72881356)
## 14) adr>=-0.7217095 76 29 none (0.38157895 0.61842105)
## 28) adr< -0.53929 8 2 children (0.75000000 0.25000000) *
## 29) adr>=-0.53929 68 23 none (0.33823529 0.66176471)
## 58) adr>=-0.3662756 57 23 none (0.40350877 0.59649123)
## 116) adr< 0.2846238 38 19 children (0.50000000 0.50000000)
## 232) stays_in_weekend_nights>=-0.8218309 24 9 children (0.62500000 0.37500000) *
## 233) stays_in_weekend_nights< -0.8218309 14 4 none (0.28571429 0.71428571) *
## 117) adr>=0.2846238 19 4 none (0.21052632 0.78947368) *
## 59) adr< -0.3662756 11 0 none (0.00000000 1.00000000) *
## 15) adr< -0.7217095 42 3 none (0.07142857 0.92857143) *
评价模型
set.seed(1234)
# 模特卡洛法交叉验证
validation_split <- mc_cv(data = train_proc, prop = 0.9, strata = children)
validation_split
## # Monte Carlo cross-validation (0.9/0.1) with 25 resamples using stratification
## # A tibble: 25 x 2
## splits id
## <list> <chr>
## 1 <split [202/22]> Resample01
## 2 <split [202/22]> Resample02
## 3 <split [202/22]> Resample03
## 4 <split [202/22]> Resample04
## 5 <split [202/22]> Resample05
## 6 <split [202/22]> Resample06
## 7 <split [202/22]> Resample07
## 8 <split [202/22]> Resample08
## 9 <split [202/22]> Resample09
## 10 <split [202/22]> Resample10
## # ... with 15 more rows
validation_split$splits[[1]] %>% analysis() %>% dim()
## [1] 202 12
validation_split$splits[[1]] %>% assessment()
## # A tibble: 22 x 12
## adr adults total_of_special_~ stays_in_week_n~ stays_in_weekend~ children
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 -0.909 -0.0643 0.957 0.675 0.671 children
## 2 -0.961 -2.94 0.957 0.193 0.671 children
## 3 -1.16 -0.0643 0.957 -0.288 0.671 children
## 4 0.837 2.82 -0.0130 0.193 -1.32 children
## 5 -0.698 -0.0643 -0.983 1.64 2.66 children
## 6 0.837 -0.0643 -0.0130 -1.73 0.671 children
## 7 -0.232 -0.0643 -0.0130 -0.770 -1.32 children
## 8 1.23 -0.0643 -0.983 0.675 2.66 children
## 9 1.09 -0.0643 -0.983 2.12 0.671 children
## 10 0.613 -0.0643 0.957 -0.770 -1.32 children
## # ... with 12 more rows, and 6 more variables: arrival_date_month_August <dbl>,
## # arrival_date_month_July <dbl>, arrival_date_month_September <dbl>,
## # meal_FB <dbl>, meal_HB <dbl>, required_car_parking_spaces_parking <dbl>
knn_res <- fit_resamples(
knn_spec,
children ~ .,
validation_split,
control = control_resamples(save_pred = T)
)
knn_res
## # Resampling results
## # Monte Carlo cross-validation (0.9/0.1) with 25 resamples using stratification
## # A tibble: 25 x 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
## 2 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
## 3 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
## 4 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
## 5 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
## 6 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
## 7 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
## 8 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
## 9 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
## 10 <split [202/2~ Resample~ <tibble[,4] [2 ~ <tibble[,1] [0 ~ <tibble[,6] [22 x~
## # ... with 15 more rows
knn_res %>% collect_metrics()
## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.649 25 0.0230 Preprocessor1_Model1
## 2 roc_auc binary 0.646 25 0.0270 Preprocessor1_Model1
knn_res %>% collect_metrics(summarise=FALSE)
## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.649 25 0.0230 Preprocessor1_Model1
## 2 roc_auc binary 0.646 25 0.0270 Preprocessor1_Model1
knn_res %>% collect_predictions()
## # A tibble: 550 x 7
## id .pred_children .pred_none .row .pred_class children .config
## <chr> <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 Resample~ 0.0822 0.918 4 none children Preprocessor1~
## 2 Resample~ 0.0822 0.918 5 none children Preprocessor1~
## 3 Resample~ 0.155 0.845 9 none children Preprocessor1~
## 4 Resample~ 0.561 0.439 16 children children Preprocessor1~
## 5 Resample~ 0 1 22 none children Preprocessor1~
## 6 Resample~ 0.180 0.820 28 none children Preprocessor1~
## 7 Resample~ 0.738 0.262 38 children children Preprocessor1~
## 8 Resample~ 0.845 0.155 80 children children Preprocessor1~
## 9 Resample~ 0.975 0.0251 98 children children Preprocessor1~
## 10 Resample~ 0 1 101 none children Preprocessor1~
## # ... with 540 more rows
rf_res <- fit_resamples(
rf_spec,
children ~ .,
validation_split,
control = control_resamples(save_pred = T)
)
rf_res %>% collect_metrics()
## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.687 25 0.0158 Preprocessor1_Model1
## 2 roc_auc binary 0.758 25 0.0154 Preprocessor1_Model1
tree_res <- fit_resamples(
tree_spec,
children ~ .,
validation_split,
control = control_resamples(save_pred = T)
)
tree_res %>% collect_metrics()
## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.713 25 0.0148 Preprocessor1_Model1
## 2 roc_auc binary 0.752 25 0.0185 Preprocessor1_Model1
knn_res %>%
unnest(.predictions) %>%
mutate(model = "kknn") %>%
bind_rows(tree_res %>%
unnest(.predictions) %>%
mutate(model = "rpart")) %>%
bind_rows(rf_res %>%
unnest(.predictions) %>%
mutate(model = "ranger")) %>%
group_by(model) %>%
roc_curve(children, .pred_children) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity, color = model)) +
geom_line(size = 1.5) +
geom_abline(
lty = 2, alpha = 0.5,
color = "gray50",
size = 1.2
)
随机森林最好
knn_conf <- knn_res %>% unnest(.predictions) %>%
conf_mat(children, .pred_class)
knn_conf
## Truth
## Prediction children none
## children 172 90
## none 103 185
knn_conf %>% autoplot()
rf_fit %>% predict(new_data = test_proc, type = "prob") %>%
mutate(truth = hotel_test$children) %>%
roc_auc(truth, .pred_children)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.798