数据探索
library(tidyverse)## -- Attaching packages ----------------------------------------------------------- tidyverse 1.3.1 --## v ggplot2 3.3.3 v purrr 0.3.4## v tibble 3.1.1 v dplyr 1.0.6## v tidyr 1.1.3 v stringr 1.4.0## v readr 1.4.0 v forcats 0.5.1## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --## x dplyr::filter() masks stats::filter()## x dplyr::lag() masks stats::lag()ikea <- read_csv("../datasets/tidytuesday/data/2020/2020-11-03/ikea.csv")#### -- Column specification ----------------------------------------------------------------------------## cols(## X1 = col_double(),## item_id = col_double(),## name = col_character(),## category = col_character(),## price = col_double(),## old_price = col_character(),## sellable_online = col_logical(),## link = col_character(),## other_colors = col_character(),## short_description = col_character(),## designer = col_character(),## depth = col_double(),## height = col_double(),## width = col_double()## )ikea## # A tibble: 3,694 x 14## X1 item_id name category price old_price sellable_online link## <dbl> <dbl> <chr> <chr> <dbl> <chr> <lgl> <chr>## 1 0 90420332 FREKVE~ Bar fur~ 265 No old p~ TRUE https://www.~## 2 1 368814 NORDVI~ Bar fur~ 995 No old p~ FALSE https://www.~## 3 2 9333523 NORDVI~ Bar fur~ 2095 No old p~ FALSE https://www.~## 4 3 80155205 STIG Bar fur~ 69 No old p~ TRUE https://www.~## 5 4 30180504 NORBERG Bar fur~ 225 No old p~ TRUE https://www.~## 6 5 10122647 INGOLF Bar fur~ 345 No old p~ TRUE https://www.~## 7 6 70404875 FRANKL~ Bar fur~ 129 No old p~ TRUE https://www.~## 8 7 60155602 DALFRED Bar fur~ 195 No old p~ TRUE https://www.~## 9 8 50406465 FRANKL~ Bar fur~ 129 No old p~ TRUE https://www.~## 10 9 69304221 EKEDAL~ Bar fur~ 2176 SR 2,375 TRUE https://www.~## # ... with 3,684 more rows, and 6 more variables: other_colors <chr>,## # short_description <chr>, designer <chr>, depth <dbl>, height <dbl>,## # width <dbl>
ikea %>%select(X1, price, depth:width) %>%pivot_longer(depth:width, names_to = "dim") %>%ggplot(aes(value, price, color = dim)) +geom_point(alpha = 0.4, show.legend = FALSE) +scale_y_log10() +facet_wrap(~dim, scales = "free_x") +labs(x = NULL)

ikea_df <- ikea %>%select(price, name, category, depth, height, width) %>%mutate(price = log10(price)) %>%mutate_if(is.character, factor)ikea_df## # A tibble: 3,694 x 6## price name category depth height width## <dbl> <fct> <fct> <dbl> <dbl> <dbl>## 1 2.42 FREKVENS Bar furniture NA 99 51## 2 3.00 NORDVIKEN Bar furniture NA 105 80## 3 3.32 NORDVIKEN / NORDVIKEN Bar furniture NA NA NA## 4 1.84 STIG Bar furniture 50 100 60## 5 2.35 NORBERG Bar furniture 60 43 74## 6 2.54 INGOLF Bar furniture 45 91 40## 7 2.11 FRANKLIN Bar furniture 44 95 50## 8 2.29 DALFRED Bar furniture 50 NA 50## 9 2.11 FRANKLIN Bar furniture 44 95 50## 10 3.34 EKEDALEN / EKEDALEN Bar furniture NA NA NA## # ... with 3,684 more rows
建立模型
library(tidymodels)## -- Attaching packages ---------------------------------------------------------- tidymodels 0.1.3 --## v broom 0.7.6 v rsample 0.0.9## v dials 0.0.9 v tune 0.1.5## v infer 0.5.4 v workflows 0.2.2## v modeldata 0.1.0 v workflowsets 0.0.2## v parsnip 0.1.5 v yardstick 0.0.8## v recipes 0.1.16## -- Conflicts ------------------------------------------------------------- tidymodels_conflicts() --## x scales::discard() masks purrr::discard()## x dplyr::filter() masks stats::filter()## x recipes::fixed() masks stringr::fixed()## x dplyr::lag() masks stats::lag()## x yardstick::spec() masks readr::spec()## x recipes::step() masks stats::step()## * Use tidymodels_prefer() to resolve common conflicts.ikea_df <- ikea_df[1:100,]set.seed(123)ikea_split <- initial_split(ikea_df, strata = price)ikea_train <- training(ikea_split)ikea_test <- testing(ikea_split)set.seed(234)ikea_folds <- bootstraps(ikea_train, strata = price,times = 5)ikea_folds## # Bootstrap sampling using stratification## # A tibble: 5 x 2## splits id## <list> <chr>## 1 <split [76/28]> Bootstrap1## 2 <split [76/29]> Bootstrap2## 3 <split [76/26]> Bootstrap3## 4 <split [76/27]> Bootstrap4## 5 <split [76/27]> Bootstrap5
library(usemodels)use_ranger(price ~ ., data = ikea_train)## ranger_recipe <-## recipe(formula = price ~ ., data = ikea_train)#### ranger_spec <-## rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%## set_mode("regression") %>%## set_engine("ranger")#### ranger_workflow <-## workflow() %>%## add_recipe(ranger_recipe) %>%## add_model(ranger_spec)#### set.seed(31129)## ranger_tune <-## tune_grid(ranger_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(textrecipes)ranger_recipe <-recipe(formula = price ~ ., data = ikea_train) %>%step_other(name, category, threshold = 0.01) %>%#step_clean_levels(name, category) %>%step_impute_knn(depth, height, width)ranger_spec <-rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%set_mode("regression") %>%set_engine("ranger",importance = "permutation")ranger_workflow <-workflow() %>%add_recipe(ranger_recipe) %>%add_model(ranger_spec)set.seed(8577)doParallel::registerDoParallel()ranger_tune <- tune_grid(ranger_workflow,resamples = ikea_folds,grid = 5)## i Creating pre-processing data to finalize unknown parameter: mtry
ranger_tune## # Tuning results## # Bootstrap sampling using stratification## # A tibble: 5 x 4## splits id .metrics .notes## <list> <chr> <list> <list>## 1 <split [76/28]> Bootstrap1 <tibble[,6] [10 x 6]> <tibble[,1] [0 x 1]>## 2 <split [76/29]> Bootstrap2 <tibble[,6] [10 x 6]> <tibble[,1] [0 x 1]>## 3 <split [76/26]> Bootstrap3 <tibble[,6] [10 x 6]> <tibble[,1] [0 x 1]>## 4 <split [76/27]> Bootstrap4 <tibble[,6] [10 x 6]> <tibble[,1] [0 x 1]>## 5 <split [76/27]> Bootstrap5 <tibble[,6] [10 x 6]> <tibble[,1] [0 x 1]>
结果
show_best(ranger_tune, metric = "rmse")## # A tibble: 5 x 8## mtry min_n .metric .estimator mean n std_err .config## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>## 1 5 9 rmse standard 0.329 5 0.0123 Preprocessor1_Model1## 2 4 31 rmse standard 0.331 5 0.0112 Preprocessor1_Model4## 3 3 24 rmse standard 0.332 5 0.0109 Preprocessor1_Model5## 4 2 37 rmse standard 0.333 5 0.0127 Preprocessor1_Model2## 5 3 13 rmse standard 0.334 5 0.0114 Preprocessor1_Model3
show_best(ranger_tune, metric = "rsq")## # A tibble: 5 x 8## mtry min_n .metric .estimator mean n std_err .config## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>## 1 5 9 rsq standard 0.316 5 0.0421 Preprocessor1_Model1## 2 3 13 rsq standard 0.280 5 0.0452 Preprocessor1_Model3## 3 4 31 rsq standard 0.264 5 0.0473 Preprocessor1_Model4## 4 3 24 rsq standard 0.261 5 0.0464 Preprocessor1_Model5## 5 2 37 rsq standard 0.221 5 0.0503 Preprocessor1_Model2
autoplot(ranger_tune)

把训练好的参数放进模型中
final_rf <- ranger_workflow %>%finalize_workflow(select_best(ranger_tune, metric = "rmse"))final_rf## == Workflow ========================================================================================## Preprocessor: Recipe## Model: rand_forest()#### -- Preprocessor ------------------------------------------------------------------------------------## 2 Recipe Steps#### * step_other()## * step_impute_knn()#### -- Model -------------------------------------------------------------------------------------------## Random Forest Model Specification (regression)#### Main Arguments:## mtry = 5## trees = 1000## min_n = 9#### Engine-Specific Arguments:## importance = permutation#### Computational engine: ranger
lase_fit使用训练好的参数重新作用于训练集!!并且!!同时获得测试集的效果
ikea_fit <- last_fit(final_rf, ikea_split)ikea_fit # 这个`last_fit`使用测试集得出来的## # Resampling results## # Manual resampling## # A tibble: 1 x 6## splits id .metrics .notes .predictions .workflow## <list> <chr> <list> <list> <list> <list>## 1 <split [76~ train/test~ <tibble[,4] [~ <tibble[,1]~ <tibble[,4] [24~ <workflo~
# 这2个都是基于测试集得出的结果collect_metrics(ikea_fit)## # A tibble: 2 x 4## .metric .estimator .estimate .config## <chr> <chr> <dbl> <chr>## 1 rmse standard 0.310 Preprocessor1_Model1## 2 rsq standard 0.294 Preprocessor1_Model1collect_predictions(ikea_fit)## # A tibble: 24 x 5## id .pred .row price .config## <chr> <dbl> <int> <dbl> <chr>## 1 train/test split 2.38 5 2.35 Preprocessor1_Model1## 2 train/test split 2.27 15 2.54 Preprocessor1_Model1## 3 train/test split 2.95 21 3.00 Preprocessor1_Model1## 4 train/test split 2.54 23 2.77 Preprocessor1_Model1## 5 train/test split 2.27 25 2.54 Preprocessor1_Model1## 6 train/test split 2.27 26 2.54 Preprocessor1_Model1## 7 train/test split 3.16 31 2.77 Preprocessor1_Model1## 8 train/test split 2.61 32 2.64 Preprocessor1_Model1## 9 train/test split 3.25 40 3.34 Preprocessor1_Model1## 10 train/test split 2.44 41 2.87 Preprocessor1_Model1## # ... with 14 more rows
collect_predictions(ikea_fit) %>%ggplot(aes(price, .pred)) +geom_abline(lty = 1, color = "red",lwd=2) +geom_point(alpha = 0.5, color = "midnightblue") +coord_fixed()

现在可以使用训练好的ikea_fit预测其他的数据了
predict(ikea_fit$.workflow[[1]], ikea_test[10, ])## # A tibble: 1 x 1## .pred## <dbl>## 1 2.44
使用vip包看看重要性,需要importance这个参数
library(vip)#### Attaching package: 'vip'## The following object is masked from 'package:utils':#### viimp_spec <- ranger_spec %>%finalize_model(select_best(ranger_tune)) %>%set_engine("ranger", importance = "permutation")workflow() %>%add_recipe(ranger_recipe) %>%add_model(imp_spec) %>%fit(ikea_train) %>%pull_workflow_fit() %>%vip(aesthetics = list(alpha = 0.8, fill = "midnightblue"))

finalize_model,这里有一个问题,因为原数据是有缺失值的,如果不使用workflow,缺失值就不会被填补,后面fit和last_fit有可能会出错,基于树的模型是不能处理有缺失值的数据的
final_range已经使用了训练好的参数,但是还没有fit过,可以使用一下2种方法fit
现在可以预测新数据了
