例如,我们按某个分组变量对数据分组,并计算每个分组内的多个分位数:
> library(dplyr)
> library(tidyr)
>
> # Simulated data: 1000 draws of x from Uniform(0, 20); y is drawn from a
> # normal distribution whose mean is sin(x) (rnorm's default sd is used).
> # Rows are then grouped by round(x), stored as x.category.
> Z <- data.frame(x = runif(1000, min = 0, max = 20))
> Z <- mutate(Z, y = rnorm(n(), mean = sin(x)))
> Z <- group_by(Z, x.category = round(x))
> Z
# A tibble: 1,000 x 3
# Groups: x.category [21]
x y x.category
<dbl> <dbl> <dbl>
1 0.670 0.121 1
2 16.5 0.0702 16
3 15.0 -1.47 15
4 3.16 -0.595 3
5 12.7 -0.915 13
6 5.25 -0.540 5
7 3.82 -0.671 4
8 10.6 -2.33 11
9 18.3 1.15 18
10 1.53 0.205 2
# … with 990 more rows
# Baseline approach: one quantile() call per statistic, then reshape the
# y25/y50/y75 columns into long format (one row per group and statistic).
Z %>%
  summarize(
    x = mean(x),
    y25 = quantile(y, probs = 0.25),
    y50 = quantile(y, probs = 0.5),
    y75 = quantile(y, probs = 0.75)
  ) %>%
  # pivot_longer() replaces the superseded gather(); every column except
  # x and x.category is stacked into the Statistic / y pair.
  pivot_longer(
    cols = -c(x, x.category),
    names_to = "Statistic",
    values_to = "y"
  ) %>%
  # Stable sort that keeps the statistic-major row order gather() produced.
  arrange(Statistic)
# 上述方法比较繁琐:每增加一个分位数,就要多写一次 quantile() 调用。
# 那么,如何一次性生成多个分位数列呢?
解法一
# List-column approach: store each group's quantiles and the matching
# probabilities as list elements, then unnest both columns together so
# every (group, probability) pair becomes its own row.
probs <- c(0.25, 0.5, 0.75)
quantile_lists <- summarize(
  Z,
  x = mean(x),
  quantile = list(quantile(y, probs)),
  prob = list(probs)
)
unnest(quantile_lists, cols = c("quantile", "prob"))
# A tibble: 63 x 4
# x.category x quantile prob
# <dbl> <dbl> <dbl> <dbl>
# 1 0 0.260 -0.527 0.25
# 2 0 0.260 0.247 0.5
# 3 0 0.260 0.704 0.75
# 4 1 0.997 0.449 0.25
# 5 1 0.997 0.912 0.5
# 6 1 0.997 1.61 0.75
# 7 2 2.00 -0.219 0.25
# 8 2 2.00 0.531 0.5
# 9 2 2.00 1.48 0.75
# 10 3 2.93 -0.498 0.25
# ... with 53 more rows
解法二
# Wide-format approach: one column per requested quantile (y_25, y_50, y_75).
# Each group's quantiles are stored as a named numeric vector in a list-column
# and spread into columns with unnest_wider(). This avoids pasting the values
# into a comma-separated string and parsing them back, which round-trips every
# number through text and depends on type guessing during conversion.
q <- c(0.25, 0.5, 0.75)
Z %>%
  summarise(
    x = mean(x),
    # The names given here become the output column names.
    qtls = list(setNames(quantile(y, q), paste0("y_", 100 * q)))
  ) %>%
  unnest_wider(qtls)