filter 筛选:只对行操作,不对列操作;subset() 则可以同时对行和列操作
> head(tips,3)
total_bill tip sex smoker day time size
1 16.99 1.01 Female No Sun Dinner 2
2 10.34 1.66 Male No Sun Dinner 3
3 21.01 3.50 Male No Sun Dinner 3
> fil1 = filter(tips,tips$smoker =='No',tips$day == 'Sun');head(fil1)
total_bill tip sex smoker day time size
1 16.99 1.01 Female No Sun Dinner 2
2 10.34 1.66 Male No Sun Dinner 3
3 21.01 3.50 Male No Sun Dinner 3
4 23.68 3.31 Male No Sun Dinner 2
5 24.59 3.61 Female No Sun Dinner 4
6 25.29 4.71 Male No Sun Dinner 4
slice 切片,行
> s1 = slice(tips,1:5);s1
total_bill tip sex smoker day time size
1 16.99 1.01 Female No Sun Dinner 2
2 10.34 1.66 Male No Sun Dinner 3
3 21.01 3.50 Male No Sun Dinner 3
4 23.68 3.31 Male No Sun Dinner 2
5 24.59 3.61 Female No Sun Dinner 4
select ,选择列
> s2 = select(tips,tip,sex,day);head(s2,3)
tip sex day
1 1.01 Female Sun
2 1.66 Male Sun
3 3.50 Male Sun
> s3 = select(tips,tip:time);head(s3,3)
tip sex smoker day time
1 1.01 Female No Sun Dinner
2 1.66 Male No Sun Dinner
3 3.50 Male No Sun Dinner
> s4 = select(tips,2:5);head(s4,3)
tip sex smoker day
1 1.01 Female No Sun
2 1.66 Male No Sun
3 3.50 Male No Sun
arrange() 排序,按指定列对行排序
> new_tips = arrange(tips,total_bill,tip);head(new_tips) #默认升序
total_bill tip sex smoker day time size
68 3.07 1.00 Female Yes Sat Dinner 1
93 5.75 1.00 Female Yes Fri Dinner 2
112 7.25 1.00 Female No Sat Dinner 1
173 7.25 5.15 Male Yes Sun Dinner 2
150 7.51 2.00 Male No Thur Lunch 2
196 7.56 1.44 Male No Thur Lunch 2
> new_tips = arrange(tips,desc(total_bill),tip);head(new_tips) #降序的操作
total_bill tip sex smoker day time size
171 50.81 10.00 Male Yes Sat Dinner 3
213 48.33 9.00 Male No Sat Dinner 4
60 48.27 6.73 Male No Sat Dinner 4
157 48.17 5.00 Male No Sun Dinner 6
183 45.35 3.50 Male Yes Sun Dinner 3
103 44.30 2.50 Female Yes Sat Dinner 3
rename 重新命名 列
> new_tips = rename(tips,bill = total_bill);head(new_tips)
bill tip sex smoker day time size
1 16.99 1.01 Female No Sun Dinner 2
2 10.34 1.66 Male No Sun Dinner 3
3 21.01 3.50 Male No Sun Dinner 3
4 23.68 3.31 Male No Sun Dinner 2
5 24.59 3.61 Female No Sun Dinner 4
6 25.29 4.71 Male No Sun Dinner 4
distinct(data,variable) 对指定列去重,返回该列的唯一取值(类似查看因子的水平)
> distinct(tips,day)
day
1 Sun
20 Sat
78 Thur
91 Fri
> distinct(tips,sex)
sex
1 Female
2 Male
mutate add new column
> m1 = mutate(tips,rate = tip/total_bill);head(m1)
total_bill tip sex smoker day time size rate
1 16.99 1.01 Female No Sun Dinner 2 0.05944673
2 10.34 1.66 Male No Sun Dinner 3 0.16054159
3 21.01 3.50 Male No Sun Dinner 3 0.16658734
4 23.68 3.31 Male No Sun Dinner 2 0.13978041
5 24.59 3.61 Female No Sun Dinner 4 0.14680765
6 25.29 4.71 Male No Sun Dinner 4 0.18623962
>
> m1 = mutate(tips,rate = tip/total_bill,new_rate = rate*100);head(m1)#可以同时生成rate和new_rate
total_bill tip sex smoker day time size rate
1 16.99 1.01 Female No Sun Dinner 2 0.05944673
2 10.34 1.66 Male No Sun Dinner 3 0.16054159
3 21.01 3.50 Male No Sun Dinner 3 0.16658734
4 23.68 3.31 Male No Sun Dinner 2 0.13978041
5 24.59 3.61 Female No Sun Dinner 4 0.14680765
6 25.29 4.71 Male No Sun Dinner 4 0.18623962
new_rate
1 5.944673
2 16.054159
3 16.658734
4 13.978041
5 14.680765
6 18.623962
transform cannot reference a variable created in the same call (unlike mutate), for example
> transform(tips,rate = tip/total_bill,new_rate = rate*100) # error cannot find rate
Error in eval(substitute(list(...)), `_data`, parent.frame()) :
找不到对象'rate'
summarise 可以(和 mutate 一样,能在同一次调用中引用刚生成的变量)
head(summarize(tips,rate = tip/total_bill) ,3)#
head(summarize(tips,rate = tip/total_bill,new_rate = rate*100) ,3)
sample: randomly select rows (sample_n by count, sample_frac by fraction)
> sample_n(iris,size = 3) #随机抽出行
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 6.5 2.8 4.6 1.5 versicolor
2 5.1 3.8 1.6 0.2 setosa
3 6.0 2.9 4.5 1.5 versicolor
> sample_frac(iris,size = 0.01) #按百分比随机抽出行
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 6.2 2.9 4.3 1.3 versicolor
2 6.8 3.2 5.9 2.3 virginica
%>% pipe
> result = tips %>% group_by(smoker,sex) %>% summarise(count = n(),mean_tips = mean(tip),
+ sd_bill = sd(total_bill));result
`summarise()` regrouping output by 'smoker' (override with `.groups` argument)
# A tibble: 4 x 5
# Groups: smoker [2]
smoker sex count mean_tips sd_bill
<fct> <fct> <int> <dbl> <dbl>
1 No Female 54 2.77 7.29
2 No Male 97 3.11 8.73
3 Yes Female 33 2.93 9.19
4 Yes Male 60 3.05 9.91
join: join two data.frames
inner_join
semi_join
anti_join
left_join
right_join
> df1 = data.frame(x = letters[1:8],y = 1:8);df1
x y
1 a 1
2 b 2
3 c 3
4 d 4
5 e 5
6 f 6
7 g 7
8 h 8
> df2 = data.frame(x = c("a","b","c"),z = 10:12);df2
x z
1 a 10
2 b 11
3 c 12
> inner_join(df1,df2,by = 'x') #交集
x y z
1 a 1 10
2 b 2 11
3 c 3 12
> semi_join(df1,df2,by = "x") #只取df1中在df2中出现的
x y
1 a 1
2 b 2
3 c 3
> anti_join(df1,df2,by = "x") #只取df1不在df2出现的
x y
1 d 4
2 e 5
3 f 6
4 g 7
5 h 8
> left_join(df1,df2,by = "x") #以左侧数据框为准合并,右侧没有匹配的自动填充NA
x y z
1 a 1 10
2 b 2 11
3 c 3 12
4 d 4 NA
5 e 5 NA
6 f 6 NA
7 g 7 NA
8 h 8 NA
> right_join(df1,df2,by = "x")
x y z
1 a 1 10
2 b 2 11
3 c 3 12