数据探索
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
ikea <- read_csv("../datasets/tidytuesday/data/2020/2020-11-03/ikea.csv")
##
## -- Column specification ----------------------------------------------------------------------------
## cols(
## X1 = col_double(),
## item_id = col_double(),
## name = col_character(),
## category = col_character(),
## price = col_double(),
## old_price = col_character(),
## sellable_online = col_logical(),
## link = col_character(),
## other_colors = col_character(),
## short_description = col_character(),
## designer = col_character(),
## depth = col_double(),
## height = col_double(),
## width = col_double()
## )
ikea
## # A tibble: 3,694 x 14
## X1 item_id name category price old_price sellable_online link
## <dbl> <dbl> <chr> <chr> <dbl> <chr> <lgl> <chr>
## 1 0 90420332 FREKVE~ Bar fur~ 265 No old p~ TRUE https://www.~
## 2 1 368814 NORDVI~ Bar fur~ 995 No old p~ FALSE https://www.~
## 3 2 9333523 NORDVI~ Bar fur~ 2095 No old p~ FALSE https://www.~
## 4 3 80155205 STIG Bar fur~ 69 No old p~ TRUE https://www.~
## 5 4 30180504 NORBERG Bar fur~ 225 No old p~ TRUE https://www.~
## 6 5 10122647 INGOLF Bar fur~ 345 No old p~ TRUE https://www.~
## 7 6 70404875 FRANKL~ Bar fur~ 129 No old p~ TRUE https://www.~
## 8 7 60155602 DALFRED Bar fur~ 195 No old p~ TRUE https://www.~
## 9 8 50406465 FRANKL~ Bar fur~ 129 No old p~ TRUE https://www.~
## 10 9 69304221 EKEDAL~ Bar fur~ 2176 SR 2,375 TRUE https://www.~
## # ... with 3,684 more rows, and 6 more variables: other_colors <chr>,
## # short_description <chr>, designer <chr>, depth <dbl>, height <dbl>,
## # width <dbl>
ikea %>%
select(X1, price, depth:width) %>%
pivot_longer(depth:width, names_to = "dim") %>%
ggplot(aes(value, price, color = dim)) +
geom_point(alpha = 0.4, show.legend = FALSE) +
scale_y_log10() +
facet_wrap(~dim, scales = "free_x") +
labs(x = NULL)
ikea_df <- ikea %>%
select(price, name, category, depth, height, width) %>%
mutate(price = log10(price)) %>%
mutate_if(is.character, factor)
ikea_df
## # A tibble: 3,694 x 6
## price name category depth height width
## <dbl> <fct> <fct> <dbl> <dbl> <dbl>
## 1 2.42 FREKVENS Bar furniture NA 99 51
## 2 3.00 NORDVIKEN Bar furniture NA 105 80
## 3 3.32 NORDVIKEN / NORDVIKEN Bar furniture NA NA NA
## 4 1.84 STIG Bar furniture 50 100 60
## 5 2.35 NORBERG Bar furniture 60 43 74
## 6 2.54 INGOLF Bar furniture 45 91 40
## 7 2.11 FRANKLIN Bar furniture 44 95 50
## 8 2.29 DALFRED Bar furniture 50 NA 50
## 9 2.11 FRANKLIN Bar furniture 44 95 50
## 10 3.34 EKEDALEN / EKEDALEN Bar furniture NA NA NA
## # ... with 3,684 more rows
建立模型
library(tidymodels)
## -- Attaching packages ---------------------------------------------------------- tidymodels 0.1.3 --
## v broom 0.7.6 v rsample 0.0.9
## v dials 0.0.9 v tune 0.1.5
## v infer 0.5.4 v workflows 0.2.2
## v modeldata 0.1.0 v workflowsets 0.0.2
## v parsnip 0.1.5 v yardstick 0.0.8
## v recipes 0.1.16
## -- Conflicts ------------------------------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## * Use tidymodels_prefer() to resolve common conflicts.
ikea_df <- ikea_df[1:100,]
set.seed(123)
ikea_split <- initial_split(ikea_df, strata = price)
ikea_train <- training(ikea_split)
ikea_test <- testing(ikea_split)
set.seed(234)
ikea_folds <- bootstraps(ikea_train, strata = price,times = 5)
ikea_folds
## # Bootstrap sampling using stratification
## # A tibble: 5 x 2
## splits id
## <list> <chr>
## 1 <split [76/28]> Bootstrap1
## 2 <split [76/29]> Bootstrap2
## 3 <split [76/26]> Bootstrap3
## 4 <split [76/27]> Bootstrap4
## 5 <split [76/27]> Bootstrap5
library(usemodels)
use_ranger(price ~ ., data = ikea_train)
## ranger_recipe <-
## recipe(formula = price ~ ., data = ikea_train)
##
## ranger_spec <-
## rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
## set_mode("regression") %>%
## set_engine("ranger")
##
## ranger_workflow <-
## workflow() %>%
## add_recipe(ranger_recipe) %>%
## add_model(ranger_spec)
##
## set.seed(31129)
## ranger_tune <-
## tune_grid(ranger_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(textrecipes)
ranger_recipe <-
recipe(formula = price ~ ., data = ikea_train) %>%
step_other(name, category, threshold = 0.01) %>%
#step_clean_levels(name, category) %>%
step_impute_knn(depth, height, width)
ranger_spec <-
rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
set_mode("regression") %>%
set_engine("ranger",importance = "permutation")
ranger_workflow <-
workflow() %>%
add_recipe(ranger_recipe) %>%
add_model(ranger_spec)
set.seed(8577)
doParallel::registerDoParallel()
ranger_tune <- tune_grid(
ranger_workflow,
resamples = ikea_folds,
grid = 5
)
## i Creating pre-processing data to finalize unknown parameter: mtry
ranger_tune
## # Tuning results
## # Bootstrap sampling using stratification
## # A tibble: 5 x 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [76/28]> Bootstrap1 <tibble[,6] [10 x 6]> <tibble[,1] [0 x 1]>
## 2 <split [76/29]> Bootstrap2 <tibble[,6] [10 x 6]> <tibble[,1] [0 x 1]>
## 3 <split [76/26]> Bootstrap3 <tibble[,6] [10 x 6]> <tibble[,1] [0 x 1]>
## 4 <split [76/27]> Bootstrap4 <tibble[,6] [10 x 6]> <tibble[,1] [0 x 1]>
## 5 <split [76/27]> Bootstrap5 <tibble[,6] [10 x 6]> <tibble[,1] [0 x 1]>
结果
show_best(ranger_tune, metric = "rmse")
## # A tibble: 5 x 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 5 9 rmse standard 0.329 5 0.0123 Preprocessor1_Model1
## 2 4 31 rmse standard 0.331 5 0.0112 Preprocessor1_Model4
## 3 3 24 rmse standard 0.332 5 0.0109 Preprocessor1_Model5
## 4 2 37 rmse standard 0.333 5 0.0127 Preprocessor1_Model2
## 5 3 13 rmse standard 0.334 5 0.0114 Preprocessor1_Model3
show_best(ranger_tune, metric = "rsq")
## # A tibble: 5 x 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 5 9 rsq standard 0.316 5 0.0421 Preprocessor1_Model1
## 2 3 13 rsq standard 0.280 5 0.0452 Preprocessor1_Model3
## 3 4 31 rsq standard 0.264 5 0.0473 Preprocessor1_Model4
## 4 3 24 rsq standard 0.261 5 0.0464 Preprocessor1_Model5
## 5 2 37 rsq standard 0.221 5 0.0503 Preprocessor1_Model2
autoplot(ranger_tune)
把训练好的参数放进模型中
final_rf <- ranger_workflow %>%
finalize_workflow(select_best(ranger_tune, metric = "rmse"))
final_rf
## == Workflow ========================================================================================
## Preprocessor: Recipe
## Model: rand_forest()
##
## -- Preprocessor ------------------------------------------------------------------------------------
## 2 Recipe Steps
##
## * step_other()
## * step_impute_knn()
##
## -- Model -------------------------------------------------------------------------------------------
## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = 5
## trees = 1000
## min_n = 9
##
## Engine-Specific Arguments:
## importance = permutation
##
## Computational engine: ranger
lase_fit
使用训练好的参数重新作用于训练集!!并且!!同时获得测试集的效果
ikea_fit <- last_fit(final_rf, ikea_split)
ikea_fit # 这个`last_fit`使用测试集得出来的
## # Resampling results
## # Manual resampling
## # A tibble: 1 x 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [76~ train/test~ <tibble[,4] [~ <tibble[,1]~ <tibble[,4] [24~ <workflo~
# 这2个都是基于测试集得出的结果
collect_metrics(ikea_fit)
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.310 Preprocessor1_Model1
## 2 rsq standard 0.294 Preprocessor1_Model1
collect_predictions(ikea_fit)
## # A tibble: 24 x 5
## id .pred .row price .config
## <chr> <dbl> <int> <dbl> <chr>
## 1 train/test split 2.38 5 2.35 Preprocessor1_Model1
## 2 train/test split 2.27 15 2.54 Preprocessor1_Model1
## 3 train/test split 2.95 21 3.00 Preprocessor1_Model1
## 4 train/test split 2.54 23 2.77 Preprocessor1_Model1
## 5 train/test split 2.27 25 2.54 Preprocessor1_Model1
## 6 train/test split 2.27 26 2.54 Preprocessor1_Model1
## 7 train/test split 3.16 31 2.77 Preprocessor1_Model1
## 8 train/test split 2.61 32 2.64 Preprocessor1_Model1
## 9 train/test split 3.25 40 3.34 Preprocessor1_Model1
## 10 train/test split 2.44 41 2.87 Preprocessor1_Model1
## # ... with 14 more rows
collect_predictions(ikea_fit) %>%
ggplot(aes(price, .pred)) +
geom_abline(lty = 1, color = "red",lwd=2) +
geom_point(alpha = 0.5, color = "midnightblue") +
coord_fixed()
现在可以使用训练好的ikea_fit
预测其他的数据了
predict(ikea_fit$.workflow[[1]], ikea_test[10, ])
## # A tibble: 1 x 1
## .pred
## <dbl>
## 1 2.44
使用vip
包看看重要性,需要importance
这个参数
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
imp_spec <- ranger_spec %>%
finalize_model(select_best(ranger_tune)) %>%
set_engine("ranger", importance = "permutation")
workflow() %>%
add_recipe(ranger_recipe) %>%
add_model(imp_spec) %>%
fit(ikea_train) %>%
pull_workflow_fit() %>%
vip(aesthetics = list(alpha = 0.8, fill = "midnightblue"))
finalize_model
,这里有一个问题,因为原数据是有缺失值的,如果不使用workflow,缺失值就不会被填补,后面fit
和last_fit
有可能会出错,基于树的模型是不能处理有缺失值的数据的
final_range
已经使用了训练好的参数,但是还没有fit
过,可以使用一下2种方法fit
现在可以预测新数据了