tidymodels-exercise-01

liyue

Last compiled on 04 七月, 2021

探索数据

  1. library(tidyverse)
  2. ## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
  3. ## v ggplot2 3.3.3 v purrr 0.3.4
  4. ## v tibble 3.1.1 v dplyr 1.0.6
  5. ## v tidyr 1.1.3 v stringr 1.4.0
  6. ## v readr 1.4.0 v forcats 0.5.1
  7. ## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
  8. ## x dplyr::filter() masks stats::filter()
  9. ## x dplyr::lag() masks stats::lag()

首先读取数据,在这个出勤率的数据中,weekly_attendance是需要预测的结果。

  1. attendance <- read_csv("../datasets/tidytuesday/data/2020/2020-02-04/attendance.csv")
  2. ##
  3. ## -- Column specification --------------------------------------------------------
  4. ## cols(
  5. ## team = col_character(),
  6. ## team_name = col_character(),
  7. ## year = col_double(),
  8. ## total = col_double(),
  9. ## home = col_double(),
  10. ## away = col_double(),
  11. ## week = col_double(),
  12. ## weekly_attendance = col_double()
  13. ## )
  14. standings <- read_csv("../datasets/tidytuesday/data/2020/2020-02-04/standings.csv")
  15. ##
  16. ## -- Column specification --------------------------------------------------------
  17. ## cols(
  18. ## team = col_character(),
  19. ## team_name = col_character(),
  20. ## year = col_double(),
  21. ## wins = col_double(),
  22. ## loss = col_double(),
  23. ## points_for = col_double(),
  24. ## points_against = col_double(),
  25. ## points_differential = col_double(),
  26. ## margin_of_victory = col_double(),
  27. ## strength_of_schedule = col_double(),
  28. ## simple_rating = col_double(),
  29. ## offensive_ranking = col_double(),
  30. ## defensive_ranking = col_double(),
  31. ## playoffs = col_character(),
  32. ## sb_winner = col_character()
  33. ## )
  34. attendance_joined <- attendance %>%
  35. left_join(standings,
  36. by = c("year", "team_name", "team")
  37. )
  38. attendance_joined
  39. ## # A tibble: 10,846 x 20
  40. ## team team_name year total home away week weekly_attendan~ wins loss
  41. ## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
  42. ## 1 Ariz~ Cardinals 2000 893926 387475 506451 1 77434 3 13
  43. ## 2 Ariz~ Cardinals 2000 893926 387475 506451 2 66009 3 13
  44. ## 3 Ariz~ Cardinals 2000 893926 387475 506451 3 NA 3 13
  45. ## 4 Ariz~ Cardinals 2000 893926 387475 506451 4 71801 3 13
  46. ## 5 Ariz~ Cardinals 2000 893926 387475 506451 5 66985 3 13
  47. ## 6 Ariz~ Cardinals 2000 893926 387475 506451 6 44296 3 13
  48. ## 7 Ariz~ Cardinals 2000 893926 387475 506451 7 38293 3 13
  49. ## 8 Ariz~ Cardinals 2000 893926 387475 506451 8 62981 3 13
  50. ## 9 Ariz~ Cardinals 2000 893926 387475 506451 9 35286 3 13
  51. ## 10 Ariz~ Cardinals 2000 893926 387475 506451 10 52244 3 13
  52. ## # ... with 10,836 more rows, and 10 more variables: points_for <dbl>,
  53. ## # points_against <dbl>, points_differential <dbl>, margin_of_victory <dbl>,
  54. ## # strength_of_schedule <dbl>, simple_rating <dbl>, offensive_ranking <dbl>,
  55. ## # defensive_ranking <dbl>, playoffs <chr>, sb_winner <chr>

看看不同队伍之间出勤率的差别,以及有无季后赛的影响?

  1. attendance_joined %>%
  2. filter(!is.na(weekly_attendance)) %>%
  3. ggplot(., aes(fct_reorder(team_name, weekly_attendance), weekly_attendance, fill = playoffs)) +
  4. geom_boxplot(outlier.alpha = 0.3) +
  5. labs(fill = NULL, x = NULL, y = "weekly attendance") +
  6. theme(legend.position = "bottom") +
  7. theme_bw() +
  8. coord_flip()

image.png
不同的周对出勤率有没有影响?

  1. attendance_joined %>%
  2. mutate(week = factor(week)) %>%
  3. ggplot(., aes(week, weekly_attendance, fill = week)) +
  4. geom_boxplot(show.legend = F, outlier.alpha = 0.3) +
  5. labs(x = "week", y = "weekly attendance")+
  6. theme_bw()

image.png

建立模型

首先删除结果变量(weekly attendance)是NA的行,并选择想作为预测变量的列。

  1. attendance_df <- attendance_joined %>%
  2. filter(!is.na(weekly_attendance)) %>%
  3. select(weekly_attendance, team_name, year, week, margin_of_victory, strength_of_schedule, playoffs)
  4. attendance_df
  5. ## # A tibble: 10,208 x 7
  6. ## weekly_attendance team_name year week margin_of_victory strength_of_schedu~
  7. ## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
  8. ## 1 77434 Cardinals 2000 1 -14.6 -0.7
  9. ## 2 66009 Cardinals 2000 2 -14.6 -0.7
  10. ## 3 71801 Cardinals 2000 4 -14.6 -0.7
  11. ## 4 66985 Cardinals 2000 5 -14.6 -0.7
  12. ## 5 44296 Cardinals 2000 6 -14.6 -0.7
  13. ## 6 38293 Cardinals 2000 7 -14.6 -0.7
  14. ## 7 62981 Cardinals 2000 8 -14.6 -0.7
  15. ## 8 35286 Cardinals 2000 9 -14.6 -0.7
  16. ## 9 52244 Cardinals 2000 10 -14.6 -0.7
  17. ## 10 64223 Cardinals 2000 11 -14.6 -0.7
  18. ## # ... with 10,198 more rows, and 1 more variable: playoffs <chr>

分割数据

  1. library(tidymodels)
  2. ## -- Attaching packages -------------------------------------- tidymodels 0.1.3 --
  3. ## v broom 0.7.6 v rsample 0.0.9
  4. ## v dials 0.0.9 v tune 0.1.5
  5. ## v infer 0.5.4 v workflows 0.2.2
  6. ## v modeldata 0.1.0 v workflowsets 0.0.2
  7. ## v parsnip 0.1.5 v yardstick 0.0.8
  8. ## v recipes 0.1.16
  9. ## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
  10. ## x scales::discard() masks purrr::discard()
  11. ## x dplyr::filter() masks stats::filter()
  12. ## x recipes::fixed() masks stringr::fixed()
  13. ## x dplyr::lag() masks stats::lag()
  14. ## x yardstick::spec() masks readr::spec()
  15. ## x recipes::step() masks stats::step()
  16. ## * Use tidymodels_prefer() to resolve common conflicts.
  17. set.seed(123)
  18. attendance_split <- attendance_df %>%
  19. initial_split(strata = playoffs)
  20. nfl_train <- training(attendance_split)
  21. nfl_test <- testing(attendance_split)

建立一个线性回归模型

  1. lm_spec <- linear_reg() %>% set_engine("lm")
  2. lm_fit <- lm_spec %>% fit(weekly_attendance ~ ., data = nfl_train)
  3. lm_fit
  4. ## parsnip model object
  5. ##
  6. ## Fit time: 31ms
  7. ##
  8. ## Call:
  9. ## stats::lm(formula = weekly_attendance ~ ., data = data)
  10. ##
  11. ## Coefficients:
  12. ## (Intercept) team_nameBears team_nameBengals
  13. ## -104175.10 -3112.97 -5261.15
  14. ## team_nameBills team_nameBroncos team_nameBrowns
  15. ## -465.98 3157.94 -248.44
  16. ## team_nameBuccaneers team_nameCardinals team_nameChargers
  17. ## -3585.25 -6652.83 -5165.30
  18. ## team_nameChiefs team_nameColts team_nameCowboys
  19. ## 1314.75 -3654.27 6141.39
  20. ## team_nameDolphins team_nameEagles team_nameFalcons
  21. ## 312.73 1345.57 -398.50
  22. ## team_nameGiants team_nameJaguars team_nameJets
  23. ## 5637.37 -3189.05 3914.46
  24. ## team_nameLions team_namePackers team_namePanthers
  25. ## -3190.57 1181.02 1886.74
  26. ## team_namePatriots team_nameRaiders team_nameRams
  27. ## -262.90 -5526.03 -2582.71
  28. ## team_nameRavens team_nameRedskins team_nameSaints
  29. ## -501.41 6537.25 130.27
  30. ## team_nameSeahawks team_nameSteelers team_nameTexans
  31. ## -1962.10 -3343.01 85.38
  32. ## team_nameTitans team_nameVikings year
  33. ## -1101.85 -2633.05 86.13
  34. ## week margin_of_victory strength_of_schedule
  35. ## -68.54 127.51 238.26
  36. ## playoffsPlayoffs
  37. ## -171.49

建立一个随机森林回归模型

  1. rf_spec <- rand_forest(mode = "regression") %>% set_engine("ranger")
  2. rf_fit <- rf_spec %>% fit(weekly_attendance ~ ., data = nfl_train)
  3. rf_fit
  4. ## parsnip model object
  5. ##
  6. ## Fit time: 4.8s
  7. ## Ranger result
  8. ##
  9. ## Call:
  10. ## ranger::ranger(x = maybe_data_frame(x), y = y, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1))
  11. ##
  12. ## Type: Regression
  13. ## Number of trees: 500
  14. ## Sample size: 7656
  15. ## Number of independent variables: 6
  16. ## Mtry: 2
  17. ## Target node size: 5
  18. ## Variable importance mode: none
  19. ## Splitrule: variance
  20. ## OOB prediction error (MSE): 74791027
  21. ## R squared (OOB): 0.06967497

评价模型

使用测试集评价模型

  1. # 下面这段合并结果的代码可以用于很多模型,值得学习
  2. results_train <- lm_fit %>%
  3. predict(new_data = nfl_train) %>%
  4. mutate(
  5. truth = nfl_train$weekly_attendance,
  6. model = "lm"
  7. ) %>%
  8. bind_rows(rf_fit %>%
  9. predict(new_data = nfl_train) %>%
  10. mutate(
  11. truth = nfl_train$weekly_attendance,
  12. model = "rf"
  13. ))
  14. results_train
  15. ## # A tibble: 15,312 x 3
  16. ## .pred truth model
  17. ## <dbl> <dbl> <chr>
  18. ## 1 59263. 66009 lm
  19. ## 2 59126. 71801 lm
  20. ## 3 59058. 66985 lm
  21. ## 4 58989. 44296 lm
  22. ## 5 58920. 38293 lm
  23. ## 6 58852. 62981 lm
  24. ## 7 58783. 35286 lm
  25. ## 8 58715. 52244 lm
  26. ## 9 58646. 64223 lm
  27. ## 10 58578. 65356 lm
  28. ## # ... with 15,302 more rows
  29. results_test <- lm_fit %>%
  30. predict(new_data = nfl_test) %>%
  31. mutate(truth = nfl_test$weekly_attendance,
  32. model = "lm"
  33. ) %>%
  34. bind_rows(rf_fit %>%
  35. predict(new_data = nfl_test
  36. )
  37. %>%
  38. mutate(truth = nfl_test$weekly_attendance,
  39. model = "rf"
  40. )
  41. )
  42. results_test
  43. ## # A tibble: 5,104 x 3
  44. ## .pred truth model
  45. ## <dbl> <dbl> <chr>
  46. ## 1 59332. 77434 lm
  47. ## 2 65999. 74309 lm
  48. ## 3 65656. 64900 lm
  49. ## 4 68015. 68843 lm
  50. ## 5 67809. 73018 lm
  51. ## 6 67672. 83252 lm
  52. ## 7 67261. 68361 lm
  53. ## 8 67654. 77884 lm
  54. ## 9 67174. 60292 lm
  55. ## 10 66968. 65546 lm
  56. ## # ... with 5,094 more rows

用rmse看看效果

  1. results_train %>%
  2. group_by(model) %>%
  3. rmse(truth = truth, estimate = .pred)
  4. ## # A tibble: 2 x 4
  5. ## model .metric .estimator .estimate
  6. ## <chr> <chr> <chr> <dbl>
  7. ## 1 lm rmse standard 8267.
  8. ## 2 rf rmse standard 6090.
  9. results_test %>%
  10. group_by(model) %>%
  11. rmse(truth = truth, estimate = .pred)
  12. ## # A tibble: 2 x 4
  13. ## model .metric .estimator .estimate
  14. ## <chr> <chr> <chr> <dbl>
  15. ## 1 lm rmse standard 8471.
  16. ## 2 rf rmse standard 8639.

看样子结果不太好

可视化结果

  1. results_test %>%
  2. mutate(train = "testing") %>%
  3. bind_rows(results_train %>%
  4. mutate(train = "training")) %>%
  5. ggplot(aes(truth, .pred, color = model)) +
  6. geom_abline(lty = 2, color = "gray80", size = 1.5) +
  7. geom_point(alpha = 0.5) +
  8. facet_wrap(~train) +
  9. labs(
  10. x = "Truth",
  11. y = "Predicted attendance",
  12. color = "Type of model"
  13. )+
  14. theme_bw()

image.png

使用交叉验证再试一次

  1. set.seed(123)
  2. nfl_folds <- vfold_cv(nfl_train, strata = playoffs)
  3. rf_res <- fit_resamples(
  4. rf_spec,
  5. weekly_attendance ~ .,
  6. nfl_folds,
  7. control = control_resamples(save_pred = TRUE)
  8. )
  9. rf_res %>%
  10. collect_metrics()
  11. ## # A tibble: 2 x 6
  12. ## .metric .estimator mean n std_err .config
  13. ## <chr> <chr> <dbl> <int> <dbl> <chr>
  14. ## 1 rmse standard 8616. 10 127. Preprocessor1_Model1
  15. ## 2 rsq standard 0.112 10 0.0112 Preprocessor1_Model1

可视化结果

  1. rf_res %>%
  2. unnest(.predictions) %>%
  3. ggplot(aes(weekly_attendance, .pred, color = id)) +
  4. geom_abline(lty = 2, color = "gray80", size = 1.5) +
  5. geom_point(alpha = 0.5) +
  6. labs(
  7. x = "Truth",
  8. y = "Predicted game attendance",
  9. color = NULL
  10. )+
  11. theme_bw()

image.png