数据探索

  1. library(tidyverse)
  2. ## -- Attaching packages ----------------------------------------------------------- tidyverse 1.3.1 --
  3. ## v ggplot2 3.3.3 v purrr 0.3.4
  4. ## v tibble 3.1.1 v dplyr 1.0.6
  5. ## v tidyr 1.1.3 v stringr 1.4.0
  6. ## v readr 1.4.0 v forcats 0.5.1
  7. ## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
  8. ## x dplyr::filter() masks stats::filter()
  9. ## x dplyr::lag() masks stats::lag()
  10. ikea <- read_csv("../datasets/tidytuesday/data/2020/2020-11-03/ikea.csv")
  11. ##
  12. ## -- Column specification ----------------------------------------------------------------------------
  13. ## cols(
  14. ## X1 = col_double(),
  15. ## item_id = col_double(),
  16. ## name = col_character(),
  17. ## category = col_character(),
  18. ## price = col_double(),
  19. ## old_price = col_character(),
  20. ## sellable_online = col_logical(),
  21. ## link = col_character(),
  22. ## other_colors = col_character(),
  23. ## short_description = col_character(),
  24. ## designer = col_character(),
  25. ## depth = col_double(),
  26. ## height = col_double(),
  27. ## width = col_double()
  28. ## )
  29. ikea
  30. ## # A tibble: 3,694 x 14
  31. ## X1 item_id name category price old_price sellable_online link
  32. ## <dbl> <dbl> <chr> <chr> <dbl> <chr> <lgl> <chr>
  33. ## 1 0 90420332 FREKVE~ Bar fur~ 265 No old p~ TRUE https://www.~
  34. ## 2 1 368814 NORDVI~ Bar fur~ 995 No old p~ FALSE https://www.~
  35. ## 3 2 9333523 NORDVI~ Bar fur~ 2095 No old p~ FALSE https://www.~
  36. ## 4 3 80155205 STIG Bar fur~ 69 No old p~ TRUE https://www.~
  37. ## 5 4 30180504 NORBERG Bar fur~ 225 No old p~ TRUE https://www.~
  38. ## 6 5 10122647 INGOLF Bar fur~ 345 No old p~ TRUE https://www.~
  39. ## 7 6 70404875 FRANKL~ Bar fur~ 129 No old p~ TRUE https://www.~
  40. ## 8 7 60155602 DALFRED Bar fur~ 195 No old p~ TRUE https://www.~
  41. ## 9 8 50406465 FRANKL~ Bar fur~ 129 No old p~ TRUE https://www.~
  42. ## 10 9 69304221 EKEDAL~ Bar fur~ 2176 SR 2,375 TRUE https://www.~
  43. ## # ... with 3,684 more rows, and 6 more variables: other_colors <chr>,
  44. ## # short_description <chr>, designer <chr>, depth <dbl>, height <dbl>,
  45. ## # width <dbl>
  1. ikea %>%
  2. select(X1, price, depth:width) %>%
  3. pivot_longer(depth:width, names_to = "dim") %>%
  4. ggplot(aes(value, price, color = dim)) +
  5. geom_point(alpha = 0.4, show.legend = FALSE) +
  6. scale_y_log10() +
  7. facet_wrap(~dim, scales = "free_x") +
  8. labs(x = NULL)

tidymodels-exercise-06 - 图1

  1. ikea_df <- ikea %>%
  2. select(price, name, category, depth, height, width) %>%
  3. mutate(price = log10(price)) %>%
  4. mutate_if(is.character, factor)
  5. ikea_df
  6. ## # A tibble: 3,694 x 6
  7. ## price name category depth height width
  8. ## <dbl> <fct> <fct> <dbl> <dbl> <dbl>
  9. ## 1 2.42 FREKVENS Bar furniture NA 99 51
  10. ## 2 3.00 NORDVIKEN Bar furniture NA 105 80
  11. ## 3 3.32 NORDVIKEN / NORDVIKEN Bar furniture NA NA NA
  12. ## 4 1.84 STIG Bar furniture 50 100 60
  13. ## 5 2.35 NORBERG Bar furniture 60 43 74
  14. ## 6 2.54 INGOLF Bar furniture 45 91 40
  15. ## 7 2.11 FRANKLIN Bar furniture 44 95 50
  16. ## 8 2.29 DALFRED Bar furniture 50 NA 50
  17. ## 9 2.11 FRANKLIN Bar furniture 44 95 50
  18. ## 10 3.34 EKEDALEN / EKEDALEN Bar furniture NA NA NA
  19. ## # ... with 3,684 more rows

建立模型

  1. library(tidymodels)
  2. ## -- Attaching packages ---------------------------------------------------------- tidymodels 0.1.3 --
  3. ## v broom 0.7.6 v rsample 0.0.9
  4. ## v dials 0.0.9 v tune 0.1.5
  5. ## v infer 0.5.4 v workflows 0.2.2
  6. ## v modeldata 0.1.0 v workflowsets 0.0.2
  7. ## v parsnip 0.1.5 v yardstick 0.0.8
  8. ## v recipes 0.1.16
  9. ## -- Conflicts ------------------------------------------------------------- tidymodels_conflicts() --
  10. ## x scales::discard() masks purrr::discard()
  11. ## x dplyr::filter() masks stats::filter()
  12. ## x recipes::fixed() masks stringr::fixed()
  13. ## x dplyr::lag() masks stats::lag()
  14. ## x yardstick::spec() masks readr::spec()
  15. ## x recipes::step() masks stats::step()
  16. ## * Use tidymodels_prefer() to resolve common conflicts.
  17. ikea_df <- ikea_df[1:100,]
  18. set.seed(123)
  19. ikea_split <- initial_split(ikea_df, strata = price)
  20. ikea_train <- training(ikea_split)
  21. ikea_test <- testing(ikea_split)
  22. set.seed(234)
  23. ikea_folds <- bootstraps(ikea_train, strata = price,times = 5)
  24. ikea_folds
  25. ## # Bootstrap sampling using stratification
  26. ## # A tibble: 5 x 2
  27. ## splits id
  28. ## <list> <chr>
  29. ## 1 <split [76/28]> Bootstrap1
  30. ## 2 <split [76/29]> Bootstrap2
  31. ## 3 <split [76/26]> Bootstrap3
  32. ## 4 <split [76/27]> Bootstrap4
  33. ## 5 <split [76/27]> Bootstrap5
  1. library(usemodels)
  2. use_ranger(price ~ ., data = ikea_train)
  3. ## ranger_recipe <-
  4. ## recipe(formula = price ~ ., data = ikea_train)
  5. ##
  6. ## ranger_spec <-
  7. ## rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
  8. ## set_mode("regression") %>%
  9. ## set_engine("ranger")
  10. ##
  11. ## ranger_workflow <-
  12. ## workflow() %>%
  13. ## add_recipe(ranger_recipe) %>%
  14. ## add_model(ranger_spec)
  15. ##
  16. ## set.seed(31129)
  17. ## ranger_tune <-
  18. ## tune_grid(ranger_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
  1. library(textrecipes)
  2. ranger_recipe <-
  3. recipe(formula = price ~ ., data = ikea_train) %>%
  4. step_other(name, category, threshold = 0.01) %>%
  5. #step_clean_levels(name, category) %>%
  6. step_impute_knn(depth, height, width)
  7. ranger_spec <-
  8. rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
  9. set_mode("regression") %>%
  10. set_engine("ranger",importance = "permutation")
  11. ranger_workflow <-
  12. workflow() %>%
  13. add_recipe(ranger_recipe) %>%
  14. add_model(ranger_spec)
  15. set.seed(8577)
  16. doParallel::registerDoParallel()
  17. ranger_tune <- tune_grid(
  18. ranger_workflow,
  19. resamples = ikea_folds,
  20. grid = 5
  21. )
  22. ## i Creating pre-processing data to finalize unknown parameter: mtry
  1. ranger_tune
  2. ## # Tuning results
  3. ## # Bootstrap sampling using stratification
  4. ## # A tibble: 5 x 4
  5. ## splits id .metrics .notes
  6. ## <list> <chr> <list> <list>
  7. ## 1 <split [76/28]> Bootstrap1 <tibble[,6] [10 x 6]> <tibble[,1] [0 x 1]>
  8. ## 2 <split [76/29]> Bootstrap2 <tibble[,6] [10 x 6]> <tibble[,1] [0 x 1]>
  9. ## 3 <split [76/26]> Bootstrap3 <tibble[,6] [10 x 6]> <tibble[,1] [0 x 1]>
  10. ## 4 <split [76/27]> Bootstrap4 <tibble[,6] [10 x 6]> <tibble[,1] [0 x 1]>
  11. ## 5 <split [76/27]> Bootstrap5 <tibble[,6] [10 x 6]> <tibble[,1] [0 x 1]>

结果

  1. show_best(ranger_tune, metric = "rmse")
  2. ## # A tibble: 5 x 8
  3. ## mtry min_n .metric .estimator mean n std_err .config
  4. ## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
  5. ## 1 5 9 rmse standard 0.329 5 0.0123 Preprocessor1_Model1
  6. ## 2 4 31 rmse standard 0.331 5 0.0112 Preprocessor1_Model4
  7. ## 3 3 24 rmse standard 0.332 5 0.0109 Preprocessor1_Model5
  8. ## 4 2 37 rmse standard 0.333 5 0.0127 Preprocessor1_Model2
  9. ## 5 3 13 rmse standard 0.334 5 0.0114 Preprocessor1_Model3
  1. show_best(ranger_tune, metric = "rsq")
  2. ## # A tibble: 5 x 8
  3. ## mtry min_n .metric .estimator mean n std_err .config
  4. ## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
  5. ## 1 5 9 rsq standard 0.316 5 0.0421 Preprocessor1_Model1
  6. ## 2 3 13 rsq standard 0.280 5 0.0452 Preprocessor1_Model3
  7. ## 3 4 31 rsq standard 0.264 5 0.0473 Preprocessor1_Model4
  8. ## 4 3 24 rsq standard 0.261 5 0.0464 Preprocessor1_Model5
  9. ## 5 2 37 rsq standard 0.221 5 0.0503 Preprocessor1_Model2
  1. autoplot(ranger_tune)

tidymodels-exercise-06 - 图2

把训练好的参数放进模型中

  1. final_rf <- ranger_workflow %>%
  2. finalize_workflow(select_best(ranger_tune, metric = "rmse"))
  3. final_rf
  4. ## == Workflow ========================================================================================
  5. ## Preprocessor: Recipe
  6. ## Model: rand_forest()
  7. ##
  8. ## -- Preprocessor ------------------------------------------------------------------------------------
  9. ## 2 Recipe Steps
  10. ##
  11. ## * step_other()
  12. ## * step_impute_knn()
  13. ##
  14. ## -- Model -------------------------------------------------------------------------------------------
  15. ## Random Forest Model Specification (regression)
  16. ##
  17. ## Main Arguments:
  18. ## mtry = 5
  19. ## trees = 1000
  20. ## min_n = 9
  21. ##
  22. ## Engine-Specific Arguments:
  23. ## importance = permutation
  24. ##
  25. ## Computational engine: ranger

last_fit 使用训练好的参数重新作用于训练集,并且同时获得模型在测试集上的效果

  1. ikea_fit <- last_fit(final_rf, ikea_split)
  2. ikea_fit # 这个`last_fit`使用测试集得出来的
  3. ## # Resampling results
  4. ## # Manual resampling
  5. ## # A tibble: 1 x 6
  6. ## splits id .metrics .notes .predictions .workflow
  7. ## <list> <chr> <list> <list> <list> <list>
  8. ## 1 <split [76~ train/test~ <tibble[,4] [~ <tibble[,1]~ <tibble[,4] [24~ <workflo~
  1. # 这2个都是基于测试集得出的结果
  2. collect_metrics(ikea_fit)
  3. ## # A tibble: 2 x 4
  4. ## .metric .estimator .estimate .config
  5. ## <chr> <chr> <dbl> <chr>
  6. ## 1 rmse standard 0.310 Preprocessor1_Model1
  7. ## 2 rsq standard 0.294 Preprocessor1_Model1
  8. collect_predictions(ikea_fit)
  9. ## # A tibble: 24 x 5
  10. ## id .pred .row price .config
  11. ## <chr> <dbl> <int> <dbl> <chr>
  12. ## 1 train/test split 2.38 5 2.35 Preprocessor1_Model1
  13. ## 2 train/test split 2.27 15 2.54 Preprocessor1_Model1
  14. ## 3 train/test split 2.95 21 3.00 Preprocessor1_Model1
  15. ## 4 train/test split 2.54 23 2.77 Preprocessor1_Model1
  16. ## 5 train/test split 2.27 25 2.54 Preprocessor1_Model1
  17. ## 6 train/test split 2.27 26 2.54 Preprocessor1_Model1
  18. ## 7 train/test split 3.16 31 2.77 Preprocessor1_Model1
  19. ## 8 train/test split 2.61 32 2.64 Preprocessor1_Model1
  20. ## 9 train/test split 3.25 40 3.34 Preprocessor1_Model1
  21. ## 10 train/test split 2.44 41 2.87 Preprocessor1_Model1
  22. ## # ... with 14 more rows
  1. collect_predictions(ikea_fit) %>%
  2. ggplot(aes(price, .pred)) +
  3. geom_abline(lty = 1, color = "red",lwd=2) +
  4. geom_point(alpha = 0.5, color = "midnightblue") +
  5. coord_fixed()

tidymodels-exercise-06 - 图3

现在可以使用训练好的ikea_fit预测其他的数据了

  1. predict(ikea_fit$.workflow[[1]], ikea_test[10, ])
  2. ## # A tibble: 1 x 1
  3. ## .pred
  4. ## <dbl>
  5. ## 1 2.44

使用vip包看看重要性,需要importance这个参数

  1. library(vip)
  2. ##
  3. ## Attaching package: 'vip'
  4. ## The following object is masked from 'package:utils':
  5. ##
  6. ## vi
  7. imp_spec <- ranger_spec %>%
  8. finalize_model(select_best(ranger_tune)) %>%
  9. set_engine("ranger", importance = "permutation")
  10. workflow() %>%
  11. add_recipe(ranger_recipe) %>%
  12. add_model(imp_spec) %>%
  13. fit(ikea_train) %>%
  14. pull_workflow_fit() %>%
  15. vip(aesthetics = list(alpha = 0.8, fill = "midnightblue"))

tidymodels-exercise-06 - 图4

finalize_model 这里有一个问题:因为原数据是有缺失值的,如果不使用 workflow,缺失值就不会被填补,后面 fit 或 last_fit 就有可能会出错,因为基于树的模型是不能处理含缺失值的数据的

final_rf 已经使用了训练好的参数,但是还没有 fit 过,可以使用以下两种方法 fit

现在可以预测新数据了