关于apply家族,大家总是很困惑,我将自己的理解写出来,如有更好的建议,欢迎留言。

apply家族

apply(array, margin, …)输入array输出vector

margin:1表示行, 2表示列

  1. mat = matrix(1:24,nrow = 4,ncol = 6);mat
  2. ## [,1] [,2] [,3] [,4] [,5] [,6]
  3. ## [1,] 1 5 9 13 17 21
  4. ## [2,] 2 6 10 14 18 22
  5. ## [3,] 3 7 11 15 19 23
  6. ## [4,] 4 8 12 16 20 24
  7. apply(mat, 1, sum)
  8. ## [1] 66 72 78 84

lapply(list, function) 输入list遍历list的元素,返回的也是list。

适合回归函数lm,因为lm返回也是list。

  1. x = list(1:5)
  2. lapply(x , FUN = log)
  3. #output
  4. [[1]]
  5. [1] 0.0000000 0.6931472 1.0986123 1.3862944 1.6094379
  6. colnames(iris)
  7. # output
  8. [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
  9. [5] "Species"
  10. lapply(iris[,1:3], function(x) lm(x~iris$Sepal.Width, data = iris))
  11. # output
  12. $Sepal.Length
  13. Call:
  14. lm(formula = x ~ iris$Sepal.Width, data = iris)
  15. Coefficients:
  16. (Intercept) iris$Sepal.Width
  17. 6.5262 -0.2234
  18. $Sepal.Width
  19. Call:
  20. lm(formula = x ~ iris$Sepal.Width, data = iris)
  21. Coefficients:
  22. (Intercept) iris$Sepal.Width
  23. 2.321e-15 1.000e+00
  24. $Petal.Length
  25. Call:
  26. lm(formula = x ~ iris$Sepal.Width, data = iris)
  27. Coefficients:
  28. (Intercept) iris$Sepal.Width
  29. 9.063 -1.735

sapply(list, function) 输入list,返回向量或矩阵(结果无法简化时返回list)

  1. sapply(1:5,log)
  2. #output
  3. [1] 0.0000000 0.6931472 1.0986123 1.3862944 1.6094379
  4. sapply(1:5, function(x)(x+3))
  5. #output
  6. [1] 4 5 6 7 8

tapply(vector, index, function) 输入vector,输出array

index表示按什么切分,将一个连续变量根据一个分类变量进行切分,并分类汇总

  1. tapply(X = iris$Sepal.Length,INDEX = iris$Species,FUN = mean) #不能使用小写x
  2. #output
  3. setosa versicolor virginica
  4. 5.006 5.936 6.588

mapply(function, …) 输入函数,输出vector

操作函数,mapply具有向量化操作的功能

  1. myfun = function(x,y) {
  2. if(x>4) return(y)
  3. else return(x+y)
  4. }
  5. myfun(1:5,2:6) #直接运行会出现警告(R 4.2.0及以上版本会直接报错),因为此函数没有向量化操作的功能
  6. #output
  7. [1] 3 5 7 9 11
  8. Warning message:
  9. In if (x > 4) return(y) else return(x + y) :
  10. 条件的长度大于一,因此只能用其第一元素
  11. mapply(myfun, 1:5,2:6)
  12. #output
  13. [1] 3 5 7 9 6

tips-从data.frame里以data.frame取列和以vector取列

  1. > head(iris$Sepal.Length)
  2. [1] 5.1 4.9 4.7 4.6 5.0 5.4
  3. > class(head(iris$Sepal.Length))
  4. [1] "numeric"
  5. > class(head(iris["Sepal.Length"]))
  6. [1] "data.frame"
  7. > head(iris[,1])
  8. [1] 5.1 4.9 4.7 4.6 5.0 5.4
  9. > head(iris["Sepal.Length"]) #data.frame
  10. Sepal.Length
  11. 1 5.1
  12. 2 4.9
  13. 3 4.7
  14. 4 4.6
  15. 5 5.0
  16. 6 5.4
  17. > class(head(iris["Sepal.Length"]))
  18. [1] "data.frame"
  19. > head(iris[,1:2]) #取两列那自然就是data.frame
  20. Sepal.Length Sepal.Width
  21. 1 5.1 3.5
  22. 2 4.9 3.0
  23. 3 4.7 3.2
  24. 4 4.6 3.1
  25. 5 5.0 3.6
  26. 6 5.4 3.9
  27. > class(head(iris[,1:2]))
  28. [1] "data.frame"

plyr包

以下函数的记忆法:第一个字母为输入数据结构,第二个字母为输出数据结构,a=array,d = dataframe,l=list
aaply(my_matrix,.margins = 2,.fun=mean) 输入array输出array
adply(my_matrix,.margins = 2,.fun=mean) 输入array输出data.frame
laply(my_list,.fun = mean)输入list 输出vector
ddply(.data = my_df,.variables =.(gender),.fun = mean) 输入dataframe,输出dataframe
dlply(iris,~Species,my_mode)适合回归分析,输入数据框,输出list

  1. library(plyr)
  2. my_matrix = matrix(1:24,nrow = 3,ncol = 8)
  3. > my_matrix
  4. [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
  5. [1,] 1 4 7 10 13 16 19 22
  6. [2,] 2 5 8 11 14 17 20 23
  7. [3,] 3 6 9 12 15 18 21 24

aaply(my_matrix,.margins = 2,.fun=mean) #输入array输出array

  1. > aaply(my_matrix,.margins = 2,.fun=mean) #输入array输出array
  2. 1 2 3 4 5 6 7 8
  3. 2 5 8 11 14 17 20 23
  4. #和apply比较
  5. > apply(my_matrix, 2,mean) #输入array输出vector
  6. [1] 2 5 8 11 14 17 20 23

adply(my_matrix,.margins = 2,.fun=mean) #输入array输出data.frame

  1. > adply(my_matrix,.margins = 2,.fun=mean) #输入array输出data.frame
  2. X1 V1
  3. 1 1 2
  4. 2 2 5
  5. 3 3 8
  6. 4 4 11
  7. 5 5 14
  8. 6 6 17
  9. 7 7 20
  10. 8 8 23

laply(my_list,.fun = mean)输入list 输出vector

  1. > my_list = list(1:10,2:8,rep(c(T,F),times = 5))
  2. > laply(my_list,.fun = mean)
  3. [1] 5.5 5.0 0.5
  4. #和lapply比较
  5. > lapply(my_list, mean) # 输入list输出list
  6. [[1]]
  7. [1] 5.5
  8. [[2]]
  9. [1] 5
  10. [[3]]
  11. [1] 0.5

ddply(.data = my_df,.variables =.(gender),.fun = mean) 输入dataframe,输出dataframe

  1. > my_df = data.frame (name= c('Tony','Andy','Bob', 'Mary', 'Leo'),
  2. + height=c(178,176,175,167,199),
  3. + gender= c('M','F','F','M','M'),
  4. + age = c("old","young","young","old","young"))
  5. > ddply(.data = my_df$sheight,.variables =.(gender),.fun = mean) #报错,此处输入的是vector
  6. NULL
  7. > ddply(.data = my_df,.variables =.(gender),summarise,
  8. + mean_h= mean(height)) #.data 输入的是整个dataframe,.(gender)是此包的格式
  9. gender mean_h
  10. 1 F 175.5000
  11. 2 M 181.3333
  12. > ddply(.data = my_df,.variables =.(gender),summarise,
  13. + mean_h= mean(height),sd_h = sd(height)) #同时汇总mean 和sd
  14. gender mean_h sd_h
  15. 1 F 175.5000 0.7071068
  16. 2 M 181.3333 16.2583312
  17. #和tapply比较,输入向量,输出array
  18. > tapply (my_df$height, my_df$gender,mean)
  19. F M
  20. 175.5000 181.3333
  21. > class(tapply (my_df$height, my_df$gender,mean) )
  22. [1] "array"
  1. > library(reshape2)
  2. > head(tips)
  3. total_bill tip sex smoker day time size
  4. 1 16.99 1.01 Female No Sun Dinner 2
  5. 2 10.34 1.66 Male No Sun Dinner 3
  6. 3 21.01 3.50 Male No Sun Dinner 3
  7. 4 23.68 3.31 Male No Sun Dinner 2
  8. 5 24.59 3.61 Female No Sun Dinner 4
  9. 6 25.29 4.71 Male No Sun Dinner 4
  10. > ddply(tips,.(sex,smoker),function(x) sum(x$tip)/sum(x$total_bill))
  11. sex smoker V1
  12. 1 Female No 0.1531892
  13. 2 Female Yes 0.1630623
  14. 3 Male No 0.1573122
  15. 4 Male Yes 0.1369188
  16. >
  17. > ddply(tips,~sex+smoker,function(x) sum(x$tip)/sum(x$total_bill)) #另一种写法
  18. sex smoker V1
  19. 1 Female No 0.1531892
  20. 2 Female Yes 0.1630623
  21. 3 Male No 0.1573122
  22. 4 Male Yes 0.1369188

dlply(iris,~Species,my_mode)适合回归分析,输入数据框,输出list

lapply() 返回list,回归分析返回的也是list,所以lapply适合回归分析
dlply适合回归分析,输入数据框,输出list

  1. > my_mode = function(x) lm(Sepal.Length~Sepal.Width,data = x)
  2. > dlply(iris,~Species,my_mode) #iris是数据,~Species是分类变量,my_mode是函数,输出的是list
  3. $setosa
  4. Call:
  5. lm(formula = Sepal.Length ~ Sepal.Width, data = x)
  6. Coefficients:
  7. (Intercept) Sepal.Width
  8. 2.6390 0.6905
  9. $versicolor
  10. Call:
  11. lm(formula = Sepal.Length ~ Sepal.Width, data = x)
  12. Coefficients:
  13. (Intercept) Sepal.Width
  14. 3.5397 0.8651
  15. $virginica
  16. Call:
  17. lm(formula = Sepal.Length ~ Sepal.Width, data = x)
  18. Coefficients:
  19. (Intercept) Sepal.Width
  20. 3.9068 0.9015
  21. attr(,"split_type")
  22. [1] "data.frame"
  23. attr(,"split_labels")
  24. Species
  25. 1 setosa
  26. 2 versicolor
  27. 3 virginica

彩蛋

each、colwise、numcolwise和ddply函数

  1. > each(mean,sd,median)(iris$Sepal.Length) #一次性的对某个对象(第二个括号)进行多种操作
  2. mean sd median
  3. 5.8433333 0.8280661 5.8000000
  4. > colwise(mean)(iris)#对列进行操作
  5. Sepal.Length Sepal.Width Petal.Length Petal.Width Species
  6. 1 5.843333 3.057333 3.758 1.199333 NA
  7. Warning message:
  8. In mean.default(X[[i]], ...) :
  9. argument is not numeric or logical: returning NA
  10. > numcolwise(mean)(iris) #只对数值型的列进行操作
  11. Sepal.Length Sepal.Width Petal.Length Petal.Width
  12. 1 5.843333 3.057333 3.758 1.199333
  13. > #ddply的好处
  14. > #写法一
  15. > ddply(iris,~Species,colwise(mean,c('Sepal.Length','Sepal.Width')))
  16. Species Sepal.Length Sepal.Width
  17. 1 setosa 5.006 3.428
  18. 2 versicolor 5.936 2.770
  19. 3 virginica 6.588 2.974
  20. > #写法二
  21. > ddply(iris,~Species,colwise(mean,.(Sepal.Length,Sepal.Width)))
  22. Species Sepal.Length Sepal.Width
  23. 1 setosa 5.006 3.428
  24. 2 versicolor 5.936 2.770
  25. 3 virginica 6.588 2.974
  26. > #写法三
  27. > ddply(iris,~Species,colwise(mean,~Sepal.Length+Sepal.Width))
  28. Species Sepal.Length Sepal.Width
  29. 1 setosa 5.006 3.428
  30. 2 versicolor 5.936 2.770
  31. 3 virginica 6.588 2.974

参考总结图
image.png