一、数据框排序
# Build a small demo data frame: two rows from each iris species.
test <- iris[c(1:2, 51:52, 101:102), ]
rownames(test) <- NULL # reset the row names to the default 1..n
test
# order() returns the permutation that sorts a vector, so x[order(x)] is the
# sorted vector; used as a row index it reorders a whole data frame.
sort(test$Sepal.Length) # sorted values of one column
test$Sepal.Length[order(test$Sepal.Length)] # same result, via order()
test[order(test$Sepal.Length), ] # reorder whole rows, ascending by default
# Spell out TRUE: T is an ordinary variable that can be reassigned.
test[order(test$Sepal.Length, decreasing = TRUE), ] # whole rows, descending
# dplyr::arrange() is a more flexible way to sort: it moves whole rows and
# takes column names bare (unquoted).
library(dplyr)
test %>% arrange(Sepal.Length) # ascending
test %>% arrange(desc(Sepal.Length)) # descending
# Sort by the first key (Sepal.Width, descending); rows that tie on it are
# then ordered by the second key (Sepal.Length, ascending).
test %>% arrange(desc(Sepal.Width), Sepal.Length)
来自dplyr包的其他函数
# mutate(): add a column. It returns a new data frame; `test` is unchanged.
mutate(test, new = Sepal.Length * Sepal.Width)
# Base-R assignment also adds the column, this time modifying `test` itself.
test[["new"]] <- test[["Sepal.Length"]] * test[["Sepal.Width"]]
# filter() keeps rows, select() keeps columns.
# The pipe %>% passes the left-hand result on to the next call, which avoids
# creating intermediate variables.
# Method 1: one intermediate variable per step.
x1 <- filter(iris, Sepal.Width > 3) # keep rows
x2 <- select(x1, Sepal.Length, Sepal.Width) # keep columns
x3 <- arrange(x2, Sepal.Length)
# Method 2: a pipe chain.
x <- iris %>%
  filter(Sepal.Width > 3) %>%
  select(Sepal.Length, Sepal.Width) %>%
  arrange(Sepal.Length)
# Method 3: nested calls, read from the inside out.
arrange(
  select(
    filter(iris, Sepal.Width > 3),
    Sepal.Length, Sepal.Width
  ),
  Sepal.Length
)
二、表达矩阵画箱式图
对数据格式进行更改
1、把原有的行名转变为第一列
2、转置T
3、宽变长
# Build a toy expression matrix and draw per-gene box plots.
# NOTE(review): the name `exp` masks base::exp() while this script runs.
set.seed(10086) # fix the random seed so the random numbers are reproducible
exp <- round(matrix(rnorm(18), ncol = 6), 2) # 3 x 6, rounded to 2 decimals
dimnames(exp) <- list(paste0("gene", 1:3), paste0("test", 1:6))
exp[, 1:3] <- exp[, 1:3] + 1 # add 1 to the first three columns
exp
library(tidyr)
library(tibble)
library(dplyr)
# Transpose so that samples become rows, move the row names into a column,
# then label the first three samples "control" and the last three "treat".
dat <- exp %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  mutate(group = rep(c("control", "treat"), each = 3))
# pivot_longer() reshapes wide to long: the gene* columns are stacked into a
# `gene` name column and a `count` value column.
pdat <- pivot_longer(
  dat,
  cols = starts_with("gene"),
  names_to = "gene",
  values_to = "count"
)
library(ggplot2)
p <- ggplot(pdat, aes(x = gene, y = count)) +
  geom_boxplot(aes(fill = group)) +
  theme_bw()
p
# Split into one panel per gene (3 panels). scales = "fixed" keeps the same
# axis ranges in every panel; scales = "free" lets each panel set its own.
p + facet_wrap(~gene, scales = "free")
三、连接
# Two small tables that share the key column `name`.
test1 <- data.frame(
  name = c("jimmy", "nicker", "Damon", "Sophie"),
  blood_type = c("A", "B", "O", "AB")
)
test1
test2 <- data.frame(
  name = c("Damon", "jimmy", "nicker", "tony"),
  group = rep(c("group1", "group2"), each = 2),
  vision = c(4.2, 4.3, 4.9, 4.5)
)
test2
library(dplyr)
test1 %>% inner_join(test2, by = "name") # intersection: keys found in both
test1 %>% left_join(test2, by = "name") # left join: every row of test1
test1 %>% right_join(test2, by = "name") # right join: every row of test2
test1 %>% full_join(test2, by = "name") # full join: every row of both
test1 %>% semi_join(test2, by = "name") # semi join: test1 rows with a match
test1 %>% anti_join(test2, by = "name") # anti join: test1 rows with no match
总结
# Exercise 7-1
# 1. Load test1.Rdata and sort the data frame `dat` by logFC, ascending.
load("test1.Rdata")
library(dplyr)
dat %>% arrange(logFC) # note: the column name is NOT quoted
# 2. Join the two data frames stored in test1.Rdata on their shared column,
#    keeping only the keys present in both (intersection).
x <- merge(dat, ids, by = "probe_id") # method 1: base R
x2 <- inner_join(dat, ids, by = "probe_id") # method 2: dplyr
四、字符串
常用函数
字符长度 | str_length(x) |
---|---|
拆分 | str_split( ) |
按位置提取字符 | str_sub( ) |
字符检测 | str_detect( ) |
替换 | str_replace( )/str_replace_all() |
删除 | str_remove()/str_remove_all() |
> rm(list = ls())
> if(!require(stringr))install.packages('stringr')
> library(stringr)
>
> x <- "The birch canoe slid on the smooth planks."
>
> x
[1] "The birch canoe slid on the smooth planks."
> ###1.检测字符串长度
> str_length(x) #共多少个字符
[1] 42
> length(x) #向量中有多少个元素
[1] 1
>
> ###2.字符串拆分
> str_split(x," ")#以空格作为拆分
[[1]]
[1] "The" "birch" "canoe" "slid" "on" "the" "smooth"
[8] "planks."
> x2 = str_split(x," ")[[1]];x2
[1] "The" "birch" "canoe" "slid" "on" "the" "smooth"
[8] "planks."
>
> y = c("jimmy 150","nicker 140","tony 152")
> str_split(y," ")
[[1]]
[1] "jimmy" "150"
[[2]]
[1] "nicker" "140"
[[3]]
[1] "tony" "152"
> str_split(y," ",simplify = T)
[,1] [,2]
[1,] "jimmy" "150"
[2,] "nicker" "140"
[3,] "tony" "152"
>
> ###3.按位置提取字符串
> str_sub(x,5,9)
[1] "birch"
>
> ###4.字符检测
> str_detect(x2,"h")#检验是否含有h
[1] TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE
> str_starts(x2,"h")#检验是否以h开头
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
> str_ends(x2,"h")#检验是否以h结尾
[1] FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
>
> ###5.字符串替换
> str_replace(x2,"o","A") #只替换一个
[1] "The" "birch" "canAe" "slid" "An" "the" "smAoth"
[8] "planks."
> str_replace_all(x2,"o","A")#全部替换
[1] "The" "birch" "canAe" "slid" "An" "the" "smAAth"
[8] "planks."
>
> ###6.字符删除
> str_remove(x," ")#只删除1个
[1] "Thebirch canoe slid on the smooth planks."
> str_remove_all(x," ")#全部删除
[1] "Thebirchcanoeslidonthesmoothplanks."
>
五、条件语句和循环语句
> rm(list = ls())
>
> ## 一.条件语句
>
> ###1.if(){ }
>
> #### (1)只有if没有else,那么条件是FALSE时就什么都不做
>
> i = -1
> if (i<0) print('up')
[1] "up"
> if (i>0) print('up')
>
> #理解下面代码
> if(!require(tidyr)) install.packages('tidyr')
>
> #### (2)有else,ifelse(x,yes,no)
#ifelse有3个参数:x是逻辑值(或逻辑向量),yes是x为TRUE时的返回值,no是x为FALSE时的返回值
> i =1
> if (i>0){
+ print('+')
+ } else {
+ print("-")
+ }
[1] "+"
>
> ifelse(i>0,"+","-")
[1] "+"
>
> x=rnorm(3)
> ifelse(x>0,"+","-")
[1] "-" "+" "+"
>
>
> #### (3)多个条件
> i = 0
> if (i>0){
+ print('+')
+ } else if (i==0) {
+ print('0')
+ } else if (i< 0){
+ print('-')
+ }
[1] "0"
>
> ifelse(i>0,"+",ifelse(i<0,"-","0"))#第二种写法
[1] "0"
>
>
循环语句
## 二、循环语句
>
> ### 1.for循环
> x <- c(5,6,0,3)
> s=0
> for (i in x){
+ s=s+i
+ print(c(i,s))
+ }
[1] 5 5
[1] 6 11
[1] 0 11
[1] 3 14
>
> x <- c(5,6,0,3)
> s = 0
> for (i in 1:length(x)){
+ s=s+x[[i]]
+ print(c(x[[i]],s))
+ }
[1] 5 5
[1] 6 11
[1] 0 11
[1] 3 14
>
> #如何将结果存下来?
> s = 0
> result = list()
> for(i in 1:length(x)){
+ s=s+x[[i]]
+ result[[i]] = c(x[[i]],s)
+ }
> result
[[1]]
[1] 5 5
[[2]]
[1] 6 11
[[3]]
[1] 0 11
[[4]]
[1] 3 14
> do.call(cbind,result)
[,1] [,2] [,3] [,4]
[1,] 5 6 0 3
[2,] 5 11 11 14
>
六、隐式循环
如何挑出一个表达矩阵里方差最大的1000个基因?
# Warm-up on a plain vector: sort ascending, then take the last 10 values.
a <- rnorm(100)
sort(a)
tail(sort(a), n = 10) # after sorting, take the last 10
# The same idea applied row-wise to a matrix.
# NOTE(review): presumably test2.Rdata provides the expression matrix `test`
# with genes as named rows — confirm against the file.
load("test2.Rdata")
b <- apply(test, MARGIN = 1, FUN = var) # variance of each row
x <- names(tail(sort(b), n = 1000)) # names of the last 1000 after sorting
head(x)
> rm(list = ls())
> ## apply()族函数
>
> ### 1.apply 处理矩阵或数据框
>
> #apply(X, MARGIN, FUN, …)
> #其中X是数据框/矩阵名;
> #MARGIN为1表示行,为2表示列,FUN是函数
>
> test<- iris[1:6,1:4]
>
> apply(test, 2, mean)
Sepal.Length Sepal.Width Petal.Length Petal.Width
4.9500000 3.3833333 1.4500000 0.2333333
>
> apply(test, 1, sum)
1 2 3 4 5 6
10.2 9.5 9.4 9.4 10.2 11.4
>
> ### 2.lapply(list, FUN, …) ,列表的隐式循环
> # 对列表/向量中的每个元素(向量)实施相同的操作
>
> test <- list(x = 36:33,y = 32:35,z = 30:27);test
$x
[1] 36 35 34 33
$y
[1] 32 33 34 35
$z
[1] 30 29 28 27
>
> #返回值是列表,对列表中的每个元素(向量)求均值(试试方差var,分位数quantile)
>
> lapply(test,mean)
$x
[1] 34.5
$y
[1] 33.5
$z
[1] 28.5
> lapply(test,fivenum)
$x
[1] 33.0 33.5 34.5 35.5 36.0
$y
[1] 32.0 32.5 33.5 34.5 35.0
$z
[1] 27.0 27.5 28.5 29.5 30.0
> ### 3.sapply 简化结果,直接返回矩阵或向量
>
> sapply(test,mean)
x y z
34.5 33.5 28.5
> sapply(test,fivenum)
x y z
[1,] 33.0 32.0 27.0
[2,] 33.5 32.5 27.5
[3,] 34.5 33.5 28.5
[4,] 35.5 34.5 29.5
[5,] 36.0 35.0 30.0
>
> class(sapply(test,fivenum))
[1] "matrix" "array"
>
#练习7-2----
# 1. Read group.csv and extract the highlighted piece of information from
#    its second column (the code reads the column named `title`).
library(stringr)
a <- read.csv("group.csv")
# Split every title on single spaces; simplify = TRUE returns a character
# matrix with one row per title. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
g <- str_split(a$title, " ", simplify = TRUE)
g
g[, 4] # the fourth space-separated field of each title
# 2. How to turn Control and Vemurafenib from the previous result into all
#    lower case? Two equivalent options:
tolower(g[, 4]) # base R
str_to_lower(g[, 4]) # stringr
# 3. Load deg.Rdata and build a vector x from columns a and b:
#      a < -1 and b < 0.05  -> "down"
#      a >  1 and b < 0.05  -> "up"
#      anything else        -> "no"
#    Then count how many times each of up / down / no occurs.
load("deg.Rdata")
# Keep the space in `< -1`; written as `<-1` it would parse as an assignment.
k1 <- deg$a < -1 & deg$b < 0.05 # rows meeting the "down" rule
table(k1)
k2 <- deg$a > 1 & deg$b < 0.05 # rows meeting the "up" rule
table(k2)
# k1 and k2 can never both be TRUE, so the nesting order does not matter.
x <- ifelse(k1, "down", ifelse(k2, "up", "no"))
table(x) # the tally the exercise asks for (missing from the original)
彩蛋:长脚本的管理
第一种方法 利用save和load进行长脚本的管理
第二种方法,用if