R包 - plyr包与apply家族 - 《C020101_R》

apply家族
tips-从data.frame里以data.frame取列和以vector取列
plyr包
彩蛋
- each、colwise、numcolwise和ddply函数

关于apply家族大家总是很困惑，我将自己的理解写出来，如有更好的建议，欢迎留言。

apply家族

apply(array, margin, …)输入array输出vector

margin:1表示行， 2表示列

mat = matrix(1:24,nrow = 4,ncol = 6);mat
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    1    5    9   13   17   21
## [2,]    2    6   10   14   18   22
## [3,]    3    7   11   15   19   23
## [4,]    4    8   12   16   20   24
apply(mat, 1, sum)
## [1] 66 72 78 84

lapply(list, function) 输入list遍历list的元素，返回的也是list。

适合回归函数lm，因为lm返回也是list。

x = list(1:5)
lapply(x , FUN = log) 
#output
[[1]]
[1] 0.0000000 0.6931472 1.0986123 1.3862944 1.6094379
colnames(iris)
# output
[1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width" 
[5] "Species"   
lapply(iris[,1:3], function(x) lm(x~iris$Sepal.Width, data = iris))
# output
$Sepal.Length
Call:
lm(formula = x ~ iris$Sepal.Width, data = iris)
Coefficients:
     (Intercept)  iris$Sepal.Width  
          6.5262           -0.2234  
$Sepal.Width
Call:
lm(formula = x ~ iris$Sepal.Width, data = iris)
Coefficients:
     (Intercept)  iris$Sepal.Width  
       2.321e-15         1.000e+00  
$Petal.Length
Call:
lm(formula = x ~ iris$Sepal.Width, data = iris)
Coefficients:
     (Intercept)  iris$Sepal.Width  
           9.063            -1.735

sapply(list, function) 输入list，返回向量或矩阵（结果无法简化时返回list）

sapply(1:5,log)
#output
[1] 0.0000000 0.6931472 1.0986123 1.3862944 1.6094379       
sapply(1:5, function(x)(x+3))       
 #output
[1] 4 5 6 7 8

tapply(vector, index, function) 输入vector，输出array

index表示按什么切分，将一个连续变量根据一个分类变量进行切分，并分类汇总

tapply(X = iris$Sepal.Length,INDEX = iris$Species,FUN = mean) #不能使用小写x
#output
setosa versicolor  virginica 
5.006      5.936      6.588

mapply(function, …) 输入函数和多个向量（或list），输出vector

操作函数，mapply具有向量化操作的功能

# Scalar helper used to demonstrate mapply(): returns y when x is greater
# than 4, otherwise returns x + y.
# The `if` condition is treated as a single value, so the function is not
# vectorized over x and y; call it element by element (e.g. through mapply).
myfun = function(x,y) {
  if(x>4) return(y)   # x above the threshold: return y unchanged
  else return(x+y)    # otherwise: return the sum of both arguments
}
myfun(1:5,2:6) 直接运行会给出警告且结果不符合预期（R 4.2.0及以后版本会直接报错），因为if只使用条件的第一个元素，此函数没有向量化操作的功能
#output
[1]  3  5  7  9 11
Warning message:
In if (x > 4) return(y) else return(x + y) :
  条件的长度大于一，因此只能用其第一元素
mapply(myfun, 1:5,2:6)
#output
[1] 3 5 7 9 6

tips-从data.frame里以data.frame取列和以vector取列

> head(iris$Sepal.Length)
[1] 5.1 4.9 4.7 4.6 5.0 5.4
> class(head(iris$Sepal.Length))
[1] "numeric"
> class(head(iris["Sepal.Length"]))
[1] "data.frame"
> head(iris[,1])
[1] 5.1 4.9 4.7 4.6 5.0 5.4
> head(iris["Sepal.Length"]) #data.frame
  Sepal.Length
1          5.1
2          4.9
3          4.7
4          4.6
5          5.0
6          5.4
> class(head(iris["Sepal.Length"]))
[1] "data.frame"
> head(iris[,1:2]) #取两列那自然就是data.frame
  Sepal.Length Sepal.Width
1          5.1         3.5
2          4.9         3.0
3          4.7         3.2
4          4.6         3.1
5          5.0         3.6
6          5.4         3.9
> class(head(iris[,1:2]))
[1] "data.frame"

plyr包

以下函数的记忆法：第一个字母为输入数据结构，第二个字母为输出数据结构，a=array，d = dataframe，l=list
aaply(my_matrix,.margins = 2,.fun=mean) 输入array输出array
adply(my_matrix,.margins = 2,.fun=mean) 输入array输出data.frame
laply(my_list,.fun = mean)输入list 输出vector
ddply(.data = my_df,.variables =.(gender),.fun = mean) 输入dataframe，输出dataframe
dlply(iris,~Species,my_mode)适合回归分析,输入数据框，输出list

library(plyr)
my_matrix = matrix(1:24,nrow = 3,ncol = 8)
> my_matrix
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,]    1    4    7   10   13   16   19   22
[2,]    2    5    8   11   14   17   20   23
[3,]    3    6    9   12   15   18   21   24

aaply(my_matrix,.margins = 2,.fun=mean) #输入array输出array

> aaply(my_matrix,.margins = 2,.fun=mean) #输入array输出array
 1  2  3  4  5  6  7  8 
 2  5  8 11 14 17 20 23 
#和apply比较
> apply(my_matrix, 2,mean) #输入array输出vector
[1]  2  5  8 11 14 17 20 23

adply(my_matrix,.margins = 2,.fun=mean) #输入array输出data.frame

> adply(my_matrix,.margins = 2,.fun=mean) #输入array输出data.frame
  X1 V1
1  1  2
2  2  5
3  3  8
4  4 11
5  5 14
6  6 17
7  7 20
8  8 23

laply(my_list,.fun = mean)输入list 输出vector

> my_list = list(1:10,2:8,rep(c(T,F),times = 5))
> laply(my_list,.fun = mean)
[1] 5.5 5.0 0.5
#和lapply比较
> lapply(my_list, mean) # 输入list输出list
[[1]]
[1] 5.5
[[2]]
[1] 5
[[3]]
[1] 0.5

ddply(.data = my_df,.variables =.(gender),.fun = mean) 输入dataframe，输出dataframe

> my_df = data.frame (name= c('Tony','Andy','Bob', 'Mary', 'Leo'),
+ height=c(178,176,175,167,199),
+ gender= c('M','F','F','M','M'),
+ age = c("old","young","young","old","young"))
> ddply(.data =  my_df$sheight,.variables =.(gender),.fun = mean) #返回NULL：列名误写为sheight，my_df$sheight为NULL；.data应传入整个data.frame，而不是vector
NULL
> ddply(.data =  my_df,.variables =.(gender),summarise,
+       mean_h= mean(height))  #.data 输入的是整个dataframe，.(gender)是此包的格式
  gender   mean_h
1      F 175.5000
2      M 181.3333
> ddply(.data =  my_df,.variables =.(gender),summarise,
+       mean_h= mean(height),sd_h = sd(height)) #同时汇总mean 和sd
  gender   mean_h       sd_h
1      F 175.5000  0.7071068
2      M 181.3333 16.2583312
#和tapply比较，输入向量，输出array
> tapply (my_df$height, my_df$gender,mean) 
       F        M 
175.5000 181.3333 
> class(tapply (my_df$height, my_df$gender,mean) )
[1] "array"

> library(reshape2)
> head(tips)
  total_bill  tip    sex smoker day   time size
1      16.99 1.01 Female     No Sun Dinner    2
2      10.34 1.66   Male     No Sun Dinner    3
3      21.01 3.50   Male     No Sun Dinner    3
4      23.68 3.31   Male     No Sun Dinner    2
5      24.59 3.61 Female     No Sun Dinner    4
6      25.29 4.71   Male     No Sun Dinner    4
> ddply(tips,.(sex,smoker),function(x) sum(x$tip)/sum(x$total_bill))
     sex smoker        V1
1 Female     No 0.1531892
2 Female    Yes 0.1630623
3   Male     No 0.1573122
4   Male    Yes 0.1369188
> 
> ddply(tips,~sex+smoker,function(x) sum(x$tip)/sum(x$total_bill)) #另一种写法
     sex smoker        V1
1 Female     No 0.1531892
2 Female    Yes 0.1630623
3   Male     No 0.1573122
4   Male    Yes 0.1369188

dlply(iris,~Species,my_mode)适合回归分析,输入数据框，输出list

lapply() 返回list，回归分析返回的也是list，所以lapply适合回归分析
dlply适合回归分析,输入数据框，输出list

> my_mode = function(x) lm(Sepal.Length~Sepal.Width,data = x)
> dlply(iris,~Species,my_mode) #iris是数据，~Species是分类变量，my_mode是函数，输出的是list
$setosa
Call:
lm(formula = Sepal.Length ~ Sepal.Width, data = x)
Coefficients:
(Intercept)  Sepal.Width  
     2.6390       0.6905  
$versicolor
Call:
lm(formula = Sepal.Length ~ Sepal.Width, data = x)
Coefficients:
(Intercept)  Sepal.Width  
     3.5397       0.8651  
$virginica
Call:
lm(formula = Sepal.Length ~ Sepal.Width, data = x)
Coefficients:
(Intercept)  Sepal.Width  
     3.9068       0.9015  
attr(,"split_type")
[1] "data.frame"
attr(,"split_labels")
     Species
1     setosa
2 versicolor
3  virginica

彩蛋

each、colwise、numcolwise和ddply函数

> each(mean,sd,median)(iris$Sepal.Length) #一次性的对某个对象（第二个括号）进行多种操作
     mean        sd    median 
5.8433333 0.8280661 5.8000000 
> colwise(mean)(iris)#对列进行操作
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1     5.843333    3.057333        3.758    1.199333      NA
Warning message:
In mean.default(X[[i]], ...) :
  argument is not numeric or logical: returning NA
> numcolwise(mean)(iris) #只对数值型的列进行操作
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1     5.843333    3.057333        3.758    1.199333
> #ddply的好处
> #写法一
> ddply(iris,~Species,colwise(mean,c('Sepal.Length','Sepal.Width')))
     Species Sepal.Length Sepal.Width
1     setosa        5.006       3.428
2 versicolor        5.936       2.770
3  virginica        6.588       2.974
> #写法二
> ddply(iris,~Species,colwise(mean,.(Sepal.Length,Sepal.Width)))
     Species Sepal.Length Sepal.Width
1     setosa        5.006       3.428
2 versicolor        5.936       2.770
3  virginica        6.588       2.974
> #写法三
> ddply(iris,~Species,colwise(mean,~Sepal.Length+Sepal.Width))
     Species Sepal.Length Sepal.Width
1     setosa        5.006       3.428
2 versicolor        5.936       2.770
3  virginica        6.588       2.974

参考总结图