filter 筛选:只对行操作,不对列;subset() 则可以同时对行和列操作

  1. > head(tips,3)
  2. total_bill tip sex smoker day time size
  3. 1 16.99 1.01 Female No Sun Dinner 2
  4. 2 10.34 1.66 Male No Sun Dinner 3
  5. 3 21.01 3.50 Male No Sun Dinner 3
  6. > fil1 = filter(tips,tips$smoker =='No',tips$day == 'Sun');head(fil1)
  7. total_bill tip sex smoker day time size
  8. 1 16.99 1.01 Female No Sun Dinner 2
  9. 2 10.34 1.66 Male No Sun Dinner 3
  10. 3 21.01 3.50 Male No Sun Dinner 3
  11. 4 23.68 3.31 Male No Sun Dinner 2
  12. 5 24.59 3.61 Female No Sun Dinner 4
  13. 6 25.29 4.71 Male No Sun Dinner 4

slice 切片,行

  1. > s1 = slice(tips,1:5);s1
  2. total_bill tip sex smoker day time size
  3. 1 16.99 1.01 Female No Sun Dinner 2
  4. 2 10.34 1.66 Male No Sun Dinner 3
  5. 3 21.01 3.50 Male No Sun Dinner 3
  6. 4 23.68 3.31 Male No Sun Dinner 2
  7. 5 24.59 3.61 Female No Sun Dinner 4

select ,选择列

  1. > s2 = select(tips,tip,sex,day);head(s2,3)
  2. tip sex day
  3. 1 1.01 Female Sun
  4. 2 1.66 Male Sun
  5. 3 3.50 Male Sun
  6. > s3 = select(tips,tip:time);head(s3,3)
  7. tip sex smoker day time
  8. 1 1.01 Female No Sun Dinner
  9. 2 1.66 Male No Sun Dinner
  10. 3 3.50 Male No Sun Dinner
  11. > s4 = select(tips,2:5);head(s4,3)
  12. tip sex smoker day
  13. 1 1.01 Female No Sun
  14. 2 1.66 Male No Sun
  15. 3 3.50 Male No Sun

arrange() 排序,按指定列对行排序

  1. > new_tips = arrange(tips,total_bill,tip);head(new_tips) #默认升序
  2. total_bill tip sex smoker day time size
  3. 68 3.07 1.00 Female Yes Sat Dinner 1
  4. 93 5.75 1.00 Female Yes Fri Dinner 2
  5. 112 7.25 1.00 Female No Sat Dinner 1
  6. 173 7.25 5.15 Male Yes Sun Dinner 2
  7. 150 7.51 2.00 Male No Thur Lunch 2
  8. 196 7.56 1.44 Male No Thur Lunch 2
  9. > new_tips = arrange(tips,desc(total_bill),tip);head(new_tips) #降序的操作
  10. total_bill tip sex smoker day time size
  11. 171 50.81 10.00 Male Yes Sat Dinner 3
  12. 213 48.33 9.00 Male No Sat Dinner 4
  13. 60 48.27 6.73 Male No Sat Dinner 4
  14. 157 48.17 5.00 Male No Sun Dinner 6
  15. 183 45.35 3.50 Male Yes Sun Dinner 3
  16. 103 44.30 2.50 Female Yes Sat Dinner 3

rename 重命名列(新列名 = 旧列名)

  1. > new_tips = rename(tips,bill = total_bill);head(new_tips)
  2. bill tip sex smoker day time size
  3. 1 16.99 1.01 Female No Sun Dinner 2
  4. 2 10.34 1.66 Male No Sun Dinner 3
  5. 3 21.01 3.50 Male No Sun Dinner 3
  6. 4 23.68 3.31 Male No Sun Dinner 2
  7. 5 24.59 3.61 Female No Sun Dinner 4
  8. 6 25.29 4.71 Male No Sun Dinner 4

distinct(data,variable) 去重,返回数据集中指定列的唯一取值(类似查看因子的各个水平)

  1. > distinct(tips,day)
  2. day
  3. 1 Sun
  4. 20 Sat
  5. 78 Thur
  6. 91 Fri
  7. > distinct(tips,sex)
  8. sex
  9. 1 Female
  10. 2 Male

mutate add new column

  1. > m1 = mutate(tips,rate = tip/total_bill);head(m1)
  2. total_bill tip sex smoker day time size rate
  3. 1 16.99 1.01 Female No Sun Dinner 2 0.05944673
  4. 2 10.34 1.66 Male No Sun Dinner 3 0.16054159
  5. 3 21.01 3.50 Male No Sun Dinner 3 0.16658734
  6. 4 23.68 3.31 Male No Sun Dinner 2 0.13978041
  7. 5 24.59 3.61 Female No Sun Dinner 4 0.14680765
  8. 6 25.29 4.71 Male No Sun Dinner 4 0.18623962
  9. >
  10. > m1 = mutate(tips,rate = tip/total_bill,new_rate = rate*100);head(m1)#可以同时生成rate和new_rate
  11. total_bill tip sex smoker day time size rate
  12. 1 16.99 1.01 Female No Sun Dinner 2 0.05944673
  13. 2 10.34 1.66 Male No Sun Dinner 3 0.16054159
  14. 3 21.01 3.50 Male No Sun Dinner 3 0.16658734
  15. 4 23.68 3.31 Male No Sun Dinner 2 0.13978041
  16. 5 24.59 3.61 Female No Sun Dinner 4 0.14680765
  17. 6 25.29 4.71 Male No Sun Dinner 4 0.18623962
  18. new_rate
  19. 1 5.944673
  20. 2 16.054159
  21. 3 16.658734
  22. 4 13.978041
  23. 5 14.680765
  24. 6 18.623962

transform cannot use a variable created in the same call to create another one, unlike mutate; for example:

  1. > transform(tips,rate = tip/total_bill,new_rate = rate*100) # error cannot find rate
  2. Error in eval(substitute(list(...)), `_data`, parent.frame()) :
  3. 找不到对象'rate'

summarise 可以,和 mutate 一样能在同一次调用中引用刚生成的变量(注意:dplyr 1.1.0 起,summarise 每组返回多行的用法已不推荐,可改用 reframe())

  1. head(summarize(tips,rate = tip/total_bill) ,3)#
  2. head(summarize(tips,rate = tip/total_bill,new_rate = rate*100) ,3)

sample: randomly sample rows (sample_n by number of rows, sample_frac by fraction)

  1. > sample_n(iris,size = 3) #随机抽出行
  2. Sepal.Length Sepal.Width Petal.Length Petal.Width Species
  3. 1 6.5 2.8 4.6 1.5 versicolor
  4. 2 5.1 3.8 1.6 0.2 setosa
  5. 3 6.0 2.9 4.5 1.5 versicolor
  6. > sample_frac(iris,size = 0.01) #按百分比随机抽出行
  7. Sepal.Length Sepal.Width Petal.Length Petal.Width Species
  8. 1 6.2 2.9 4.3 1.3 versicolor
  9. 2 6.8 3.2 5.9 2.3 virginica

%>% pipe

  1. > result = tips %>% group_by(smoker,sex) %>% summarise(count = n(),mean_tips = mean(tip),
  2. + sd_bill = sd(total_bill));result
  3. `summarise()` regrouping output by 'smoker' (override with `.groups` argument)
  4. # A tibble: 4 x 5
  5. # Groups: smoker [2]
  6. smoker sex count mean_tips sd_bill
  7. <fct> <fct> <int> <dbl> <dbl>
  8. 1 No Female 54 2.77 7.29
  9. 2 No Male 97 3.11 8.73
  10. 3 Yes Female 33 2.93 9.19
  11. 4 Yes Male 60 3.05 9.91

join: join two data.frames

inner_join

semi_join

anti_join

left_join

right_join

  1. > df1 = data.frame(x = letters[1:8],y = 1:8);df1
  2. x y
  3. 1 a 1
  4. 2 b 2
  5. 3 c 3
  6. 4 d 4
  7. 5 e 5
  8. 6 f 6
  9. 7 g 7
  10. 8 h 8
  11. > df2 = data.frame(x = c("a","b","c"),z = 10:12);df2
  12. x z
  13. 1 a 10
  14. 2 b 11
  15. 3 c 12
  16. > inner_join(df1,df2,by = 'x') #交集,只保留两个数据框中都出现的x
  17. x y z
  18. 1 a 1 10
  19. 2 b 2 11
  20. 3 c 3 12
  21. > semi_join(df1,df2,by = "x") #只取df1中在df2中出现的
  22. x y
  23. 1 a 1
  24. 2 b 2
  25. 3 c 3
  26. > anti_join(df1,df2,by = "x") #只取df1不在df2出现的
  27. x y
  28. 1 d 4
  29. 2 e 5
  30. 3 f 6
  31. 4 g 7
  32. 5 h 8
  33. > left_join(df1,df2,by = "x") #按照左侧数据框结合,没有的就自动填充NA
  34. x y z
  35. 1 a 1 10
  36. 2 b 2 11
  37. 3 c 3 12
  38. 4 d 4 NA
  39. 5 e 5 NA
  40. 6 f 6 NA
  41. 7 g 7 NA
  42. 8 h 8 NA
  43. > right_join(df1,df2,by = "x")
  44. x y z
  45. 1 a 1 10
  46. 2 b 2 11
  47. 3 c 3 12