dplyr中提供的动词语法与其他操作进行连接,可以实现更加复杂的数据操作

group操作

  1. # group操作
  2. by_tailnum <- group_by(flights, tailnum)
  3. # group操作
  4. delay <- summarise(by_tailnum,
  5. count = n(),
  6. dist = mean(distance, na.rm = TRUE),
  7. delay = mean(arr_delay, na.rm = TRUE))
  1. delay
  1. ## # A tibble: 4,044 x 4
  2. ## tailnum count dist delay
  3. ## <chr> <int> <dbl> <dbl>
  4. ## 1 D942DN 4 854. 31.5
  5. ## 2 N0EGMQ 371 676. 9.98
  6. ## 3 N10156 153 758. 12.7
  7. ## 4 N102UW 48 536. 2.94
  8. ## 5 N103US 46 535. -6.93
  9. ## 6 N104UW 47 535. 1.80
  10. ## 7 N10575 289 520. 20.7
  11. ## 8 N105UW 45 525. -0.267
  12. ## 9 N107US 41 529. -5.73
  13. ## 10 N108UW 60 534. -1.25
  14. ## # ... with 4,034 more rows

summarise()函数中可以与基础的R函数(如:min())进行联合操作,同时dplyr同样提供了一系列的快捷操作:

  • n(): 观测值的个数
  • n_distinct(x): unique的观测值数目
  • first(x),last(x), nth(x, n): 第一个、最后一个、第n个观测值
  1. destinations <- group_by(flights, dest)
  2. summarise(destinations,
  3. planes = n_distinct(tailnum),
  4. flights = n()
  5. )
  1. ## # A tibble: 105 x 3
  2. ## dest planes flights
  3. ## <chr> <int> <int>
  4. ## 1 ABQ 108 254
  5. ## 2 ACK 58 265
  6. ## 3 ALB 172 439
  7. ## 4 ANC 6 8
  8. ## 5 ATL 1180 17215
  9. ## 6 AUS 993 2439
  10. ## 7 AVL 159 275
  11. ## 8 BDL 186 443
  12. ## 9 BGR 46 375
  13. ## 10 BHM 45 297
  14. ## # ... with 95 more rows

当依据多个变量进行分组时,可以根据summarise()层层递进,得出结果

  1. daily <- group_by(flights, year, month, day)
  2. per_day <- summarise(daily, flights = n())
  1. per_month <- summarise(per_day, flights = sum(flights))
  1. per_year <- summarise(per_month, flights = sum(flights))

select 操作

select函数可以接受数字和具体的列名,但需要注意的是,当有与列名相同的其他赋值时,不能进行操作

  1. select(flights, year)
  1. ## # A tibble: 336,776 x 1
  2. ## year
  3. ## <int>
  4. ## 1 2013
  5. ## 2 2013
  6. ## 3 2013
  7. ## 4 2013
  8. ## 5 2013
  9. ## 6 2013
  10. ## 7 2013
  11. ## 8 2013
  12. ## 9 2013
  13. ## 10 2013
  14. ## # ... with 336,766 more rows
  1. select(flights, 1)
  1. ## # A tibble: 336,776 x 1
  2. ## year
  3. ## <int>
  4. ## 1 2013
  5. ## 2 2013
  6. ## 3 2013
  7. ## 4 2013
  8. ## 5 2013
  9. ## 6 2013
  10. ## 7 2013
  11. ## 8 2013
  12. ## 9 2013
  13. ## 10 2013
  14. ## # ... with 336,766 more rows
  1. year <- 5
  2. select(flights, year)
  1. ## # A tibble: 336,776 x 1
  2. ## year
  3. ## <int>
  4. ## 1 2013
  5. ## 2 2013
  6. ## 3 2013
  7. ## 4 2013
  8. ## 5 2013
  9. ## 6 2013
  10. ## 7 2013
  11. ## 8 2013
  12. ## 9 2013
  13. ## 10 2013
  14. ## # ... with 336,766 more rows

Mutating 操作

  1. df <- select(flights, year:dep_time)
  2. mutate(df, "year", 2)
  1. ## # A tibble: 336,776 x 6
  2. ## year month day dep_time `"year"` `2`
  3. ## <int> <int> <int> <int> <chr> <dbl>
  4. ## 1 2013 1 1 517 year 2
  5. ## 2 2013 1 1 533 year 2
  6. ## 3 2013 1 1 542 year 2
  7. ## 4 2013 1 1 544 year 2
  8. ## 5 2013 1 1 554 year 2
  9. ## 6 2013 1 1 554 year 2
  10. ## 7 2013 1 1 555 year 2
  11. ## 8 2013 1 1 557 year 2
  12. ## 9 2013 1 1 557 year 2
  13. ## 10 2013 1 1 558 year 2
  14. ## # ... with 336,766 more rows
  1. mutate(df, year + 10)
  1. ## # A tibble: 336,776 x 5
  2. ## year month day dep_time `year + 10`
  3. ## <int> <int> <int> <int> <dbl>
  4. ## 1 2013 1 1 517 2023
  5. ## 2 2013 1 1 533 2023
  6. ## 3 2013 1 1 542 2023
  7. ## 4 2013 1 1 544 2023
  8. ## 5 2013 1 1 554 2023
  9. ## 6 2013 1 1 554 2023
  10. ## 7 2013 1 1 555 2023
  11. ## 8 2013 1 1 557 2023
  12. ## 9 2013 1 1 557 2023
  13. ## 10 2013 1 1 558 2023
  14. ## # ... with 336,766 more rows
  1. var <- seq(1, nrow(df))
  2. mutate(df, new = var)
  1. ## # A tibble: 336,776 x 5
  2. ## year month day dep_time new
  3. ## <int> <int> <int> <int> <int>
  4. ## 1 2013 1 1 517 1
  5. ## 2 2013 1 1 533 2
  6. ## 3 2013 1 1 542 3
  7. ## 4 2013 1 1 544 4
  8. ## 5 2013 1 1 554 5
  9. ## 6 2013 1 1 554 6
  10. ## 7 2013 1 1 555 7
  11. ## 8 2013 1 1 557 8
  12. ## 9 2013 1 1 557 9
  13. ## 10 2013 1 1 558 10
  14. ## # ... with 336,766 more rows

group_by()的操作同样可以修改和创建变量,不同的时该变量是group之后的

  1. group_by(df, month)
  1. ## # A tibble: 336,776 x 4
  2. ## # Groups: month [12]
  3. ## year month day dep_time
  4. ## <int> <int> <int> <int>
  5. ## 1 2013 1 1 517
  6. ## 2 2013 1 1 533
  7. ## 3 2013 1 1 542
  8. ## 4 2013 1 1 544
  9. ## 5 2013 1 1 554
  10. ## 6 2013 1 1 554
  11. ## 7 2013 1 1 555
  12. ## 8 2013 1 1 557
  13. ## 9 2013 1 1 557
  14. ## 10 2013 1 1 558
  15. ## # ... with 336,766 more rows
  1. group_by(df, month = as.factor(month))
  1. ## # A tibble: 336,776 x 4
  2. ## # Groups: month [12]
  3. ## year month day dep_time
  4. ## <int> <fct> <int> <int>
  5. ## 1 2013 1 1 517
  6. ## 2 2013 1 1 533
  7. ## 3 2013 1 1 542
  8. ## 4 2013 1 1 544
  9. ## 5 2013 1 1 554
  10. ## 6 2013 1 1 554
  11. ## 7 2013 1 1 555
  12. ## 8 2013 1 1 557
  13. ## 9 2013 1 1 557
  14. ## 10 2013 1 1 558
  15. ## # ... with 336,766 more rows
  1. group_by(df, day_binned = cut(day, 3))
  1. ## # A tibble: 336,776 x 5
  2. ## # Groups: day_binned [3]
  3. ## year month day dep_time day_binned
  4. ## <int> <int> <int> <int> <fct>
  5. ## 1 2013 1 1 517 (0.97,11]
  6. ## 2 2013 1 1 533 (0.97,11]
  7. ## 3 2013 1 1 542 (0.97,11]
  8. ## 4 2013 1 1 544 (0.97,11]
  9. ## 5 2013 1 1 554 (0.97,11]
  10. ## 6 2013 1 1 554 (0.97,11]
  11. ## 7 2013 1 1 555 (0.97,11]
  12. ## 8 2013 1 1 557 (0.97,11]
  13. ## 9 2013 1 1 557 (0.97,11]
  14. ## 10 2013 1 1 558 (0.97,11]
  15. ## # ... with 336,766 more rows

pipe 操作

当进行多重的动词语法操作时,可以把一系列的操作进行融合,不过因此会损害代码的可读性

  1. # 鎸笺父鏄笺工鎴笺父鏄笺付鎸笺功鏀笺赴同鎴笺傅牟鎼笺腹鎼笺阜鏄笺阜
  2. # method 1
  3. a1 <- group_by(flights, year, month, day)
  4. a2 <- select(a1, arr_delay, dep_delay)
  1. ## Adding missing grouping variables: `year`, `month`, `day`
  1. a3 <- summarise(a2,
  2. arr = mean(arr_delay, na.rm = TRUE),
  3. dep = mean(dep_delay, na.rm = TRUE))
  1. ## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
  1. a4 <- filter(a3, arr > 30 | dep > 30)
  2. a4
  1. ## # A tibble: 49 x 5
  2. ## # Groups: year, month [11]
  3. ## year month day arr dep
  4. ## <int> <int> <int> <dbl> <dbl>
  5. ## 1 2013 1 16 34.2 24.6
  6. ## 2 2013 1 31 32.6 28.7
  7. ## 3 2013 2 11 36.3 39.1
  8. ## 4 2013 2 27 31.3 37.8
  9. ## 5 2013 3 8 85.9 83.5
  10. ## 6 2013 3 18 41.3 30.1
  11. ## 7 2013 4 10 38.4 33.0
  12. ## 8 2013 4 12 36.0 34.8
  13. ## 9 2013 4 18 36.0 34.9
  14. ## 10 2013 4 19 47.9 46.1
  15. ## # ... with 39 more rows
  1. # method 2
  2. filter(
  3. summarise(
  4. select(
  5. group_by(flights, year, month, day),
  6. arr_delay, dep_delay
  7. ),
  8. arr = mean(arr_delay, na.rm = TRUE),
  9. dep = mean(dep_delay, na.rm = TRUE)
  10. ),
  11. arr > 30 | dep > 30
  12. )
  1. ## Adding missing grouping variables: `year`, `month`, `day`
  1. ## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
  1. ## # A tibble: 49 x 5
  2. ## # Groups: year, month [11]
  3. ## year month day arr dep
  4. ## <int> <int> <int> <dbl> <dbl>
  5. ## 1 2013 1 16 34.2 24.6
  6. ## 2 2013 1 31 32.6 28.7
  7. ## 3 2013 2 11 36.3 39.1
  8. ## 4 2013 2 27 31.3 37.8
  9. ## 5 2013 3 8 85.9 83.5
  10. ## 6 2013 3 18 41.3 30.1
  11. ## 7 2013 4 10 38.4 33.0
  12. ## 8 2013 4 12 36.0 34.8
  13. ## 9 2013 4 18 36.0 34.9
  14. ## 10 2013 4 19 47.9 46.1
  15. ## # ... with 39 more rows
  1. # method 3
  2. flights %>%
  3. group_by(year, month, day) %>%
  4. select(arr_delay, dep_delay) %>%
  5. summarise(
  6. arr = mean(arr_delay, na.rm = TRUE),
  7. dep = mean(dep_delay, na.rm = TRUE)
  8. ) %>%
  9. filter(arr > 30 | dep > 30)
  1. ## Adding missing grouping variables: `year`, `month`, `day`
  2. ## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
  1. ## # A tibble: 49 x 5
  2. ## # Groups: year, month [11]
  3. ## year month day arr dep
  4. ## <int> <int> <int> <dbl> <dbl>
  5. ## 1 2013 1 16 34.2 24.6
  6. ## 2 2013 1 31 32.6 28.7
  7. ## 3 2013 2 11 36.3 39.1
  8. ## 4 2013 2 27 31.3 37.8
  9. ## 5 2013 3 8 85.9 83.5
  10. ## 6 2013 3 18 41.3 30.1
  11. ## 7 2013 4 10 38.4 33.0
  12. ## 8 2013 4 12 36.0 34.8
  13. ## 9 2013 4 18 36.0 34.9
  14. ## 10 2013 4 19 47.9 46.1
  15. ## # ... with 39 more rows