- vector,向量,一维
- matrix,表格,矩阵,二维,只能有1种数据类型
- data.frame,数据框,二维,每列只能有1种数据类型
- list,列表,包罗万象
判断依据:1、生成的函数;2、class或is函数
数据框来源
- 代码新建
- 已有数据转换或处理结果
- 读取表格文件
- R语言内置数据,eg.LETTERS
新建数据框
筛选score>0 的行
删除变量
> #重点:数据框
> #1.数据框来源
> # (1)用代码新建
> # (2)由已有数据转换或处理得到
> # (3)读取表格文件
> # (4)R语言内置数据
>
> #2.新建和读取数据框
> df1 <- data.frame(gene = paste0("gene",1:4),
+ change = rep(c("up","down"),each = 2),
+ score = c(5,3,-2,-4))
> df1
gene change score
1 gene1 up 5
2 gene2 up 3
3 gene3 down -2
4 gene4 down -4
>
> df2 <- read.csv("gene.csv")
> df2
gene change score
1 gene1 up 5
2 gene2 up 3
3 gene3 down -2
4 gene4 down -4
>
> #3.数据框属性
> #维度,指行数和列数
> dim(df1)
[1] 4 3
> nrow(df1)#行数
[1] 4
> ncol(df1)#列数
[1] 3
> #行名和列名
> rownames(df1)
[1] "1" "2" "3" "4"
> colnames(df1)
[1] "gene" "change" "score"
>
> #4.数据框取子集
> df1$gene # $一次只能提取1列,不能多列
[1] "gene1" "gene2" "gene3" "gene4"
> df1$score#tab键可协助补齐
[1] 5 3 -2 -4
> mean(df1$score)
[1] 0.5
>
> ## 按坐标
> df1[2,2]
[1] "up"
> df1[2,] #取第二行
gene change score
2 gene2 up 3
> df1[,2] #取第二列
[1] "up" "up" "down" "down"
> df1[c(1,3),1:2] #第一、三行和第一、二列
gene change
1 gene1 up
3 gene3 down
>
> ## 按名字
> df1[,"gene"] #可实现1次提取多列
[1] "gene1" "gene2" "gene3" "gene4"
> df1[,c('gene','change')]
gene change
1 gene1 up
2 gene2 up
3 gene3 down
4 gene4 down
>
> ## 按条件(逻辑值)
> df1[df1$score>0,]#筛选score>0的行
gene change score
1 gene1 up 5
2 gene2 up 3
> #拆分解答
> df1$score
[1] 5 3 -2 -4
> df1$score>0
[1] TRUE TRUE FALSE FALSE
> df1$score[df1$score>0]
[1] 5 3
> df1[df1$score>0,]
gene change score
1 gene1 up 5
2 gene2 up 3
>
> df1[df1$score>0,1]#筛选score>0的基因
[1] "gene1" "gene2"
> df1$gene[df1$score>0]#筛选score>0的基因
[1] "gene1" "gene2"
>
> #5.数据框修改
>
> #改一个格(注意:本节的输出是脚本第二次运行时的记录,因此已包含后面才新增的p.value列、行名r1-r4和列名CHANGE)
> df1[3,3] <- 5
> df1
gene CHANGE score p.value
r1 gene1 up 12 0.01
r2 gene2 up 23 0.02
r3 gene3 down 5 0.07
r4 gene4 down 2 0.05
> #改一整列
> df1$score <- c(12,23,50,2)
> df1
gene CHANGE score p.value
r1 gene1 up 12 0.01
r2 gene2 up 23 0.02
r3 gene3 down 50 0.07
r4 gene4 down 2 0.05
> #新增一列
> df1$p.value <- c(0.01,0.02,0.07,0.05) #对新的名称而言是新增,对原有的而言是修改
> df1
gene CHANGE score p.value
r1 gene1 up 12 0.01
r2 gene2 up 23 0.02
r3 gene3 down 50 0.07
r4 gene4 down 2 0.05
>
> #改行名和列名
> rownames(df1) <- c("r1","r2","r3","r4")
> #只修改某一行/列的名
> colnames(df1)[2] <- "CHANGE"
>
> #6.两个数据框的连接,merge左连接、右连接、取交集
> test1 <- data.frame(name = c('jimmy','nicker','Damon','Sophie'),
+ blood_type = c("A","B","O","AB"))
> test1
name blood_type
1 jimmy A
2 nicker B
3 Damon O
4 Sophie AB
> test2 <- data.frame(name = c('Damon','jimmy','nicker','tony'),
+ group = c("group1","group1","group2","group2"),
+ vision = c(4.2,4.3,4.9,4.5))
> test2
name group vision
1 Damon group1 4.2
2 jimmy group1 4.3
3 nicker group2 4.9
4 tony group2 4.5
>
> test3 <- data.frame(NAME = c('Damon','jimmy','nicker','tony'),
+ weight = c(140,145,110,138))
> test3
NAME weight
1 Damon 140
2 jimmy 145
3 nicker 110
4 tony 138
> merge(test1,test2,by="name")
name blood_type group vision
1 Damon O group1 4.2
2 jimmy A group1 4.3
3 nicker B group2 4.9
> merge(test1,test3,by.x = "name",by.y = "NAME")#用于两个数据框中连接列的列名不一致时(如此处的name和NAME)
name blood_type weight
1 Damon O 140
2 jimmy A 145
3 nicker B 110
> ##### 矩阵和列表
> m <- matrix(1:9, nrow = 3)
> colnames(m) <- c("a","b","c") #加列名
> m
a b c
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
> #取子集,不支持$
> m[2,]
a b c
2 5 8
> m[,1]
[1] 1 2 3
> m[2,3]
c
8
> m[2:3,1:2]
a b
[1,] 2 5
[2,] 3 6
> m
a b c
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
> t(m) #转置
[,1] [,2] [,3]
a 1 2 3
b 4 5 6
c 7 8 9
> as.data.frame(m) # 需要再赋值才会变
a b c
1 1 4 7
2 2 5 8
3 3 6 9
>
> #列表,列表的下一级是元素
> l <- list(m1 = matrix(1:9, nrow = 3),
+ m2 = matrix(2:9, nrow = 2))
> l
$m1
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
$m2
[,1] [,2] [,3] [,4]
[1,] 2 4 6 8
[2,] 3 5 7 9
>
> l[[2]]#取子集,取l列表中的第2个元素
[,1] [,2] [,3] [,4]
[1,] 2 4 6 8
[2,] 3 5 7 9
> l$m1 # 取l列表中的m1元素
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
>
> # 补充:元素的名字
> scores = c(100,59,73,95,45)
> names(scores) = c("jimmy","nicker","Damon","Sophie","tony")
> scores
jimmy nicker Damon Sophie tony
100 59 73 95 45
> scores["jimmy"]
jimmy
100
> scores[c("jimmy","nicker")]
jimmy nicker
100 59
>
> names(scores)[scores>60] #选出>60的子集
[1] "jimmy" "Damon" "Sophie"
>
> # 删除变量
> rm(l) #删除一个
> rm(df1,df2) #删除多个
Warning messages:
1: In rm(df1, df2) : 找不到对象'df1'
2: In rm(df1, df2) : 找不到对象'df2'
> rm(list = ls()) #删除全部
> #清空控制台: ctrl+l
>
> #调整元素顺序
> x <- c("A","B","C","D","E");x
[1] "A" "B" "C" "D" "E"
> x[c(2,4,1,3,5)]
[1] "B" "D" "A" "C" "E"
>
> scores=c(100,59,73,95,45);scores
[1] 100 59 73 95 45
> scores[c(5,2,3,4,1)]
[1] 45 59 73 95 100
> sort(scores) #另一种方式,从小到大排序
[1] 45 59 73 95 100
> order(scores) #通过order取子集生成的结果等同于sort
[1] 5 2 3 4 1
>
> #向量匹配排序,match
> x <- c("A","B","C","D","E")
> y <- c("B","D","A","C","E")
> match(y,x)#以y为模版、目标、结果,以x为原料,去进行调整顺序所得到的下标
[1] 2 4 1 3 5
> x[match(y,x)]
[1] "B" "D" "A" "C" "E"
>
练习
> # 练习3-2
> # 1.统计内置数据iris最后一列有哪几个取值,每个取值重复了多少次
> iris[,ncol(iris)]
[1] setosa setosa setosa setosa
[5] setosa setosa setosa setosa
[9] setosa setosa setosa setosa
[13] setosa setosa setosa setosa
[17] setosa setosa setosa setosa
[21] setosa setosa setosa setosa
[25] setosa setosa setosa setosa
[29] setosa setosa setosa setosa
[33] setosa setosa setosa setosa
[37] setosa setosa setosa setosa
[41] setosa setosa setosa setosa
[45] setosa setosa setosa setosa
[49] setosa setosa versicolor versicolor
[53] versicolor versicolor versicolor versicolor
[57] versicolor versicolor versicolor versicolor
[61] versicolor versicolor versicolor versicolor
[65] versicolor versicolor versicolor versicolor
[69] versicolor versicolor versicolor versicolor
[73] versicolor versicolor versicolor versicolor
[77] versicolor versicolor versicolor versicolor
[81] versicolor versicolor versicolor versicolor
[85] versicolor versicolor versicolor versicolor
[89] versicolor versicolor versicolor versicolor
[93] versicolor versicolor versicolor versicolor
[97] versicolor versicolor versicolor versicolor
[101] virginica virginica virginica virginica
[105] virginica virginica virginica virginica
[109] virginica virginica virginica virginica
[113] virginica virginica virginica virginica
[117] virginica virginica virginica virginica
[121] virginica virginica virginica virginica
[125] virginica virginica virginica virginica
[129] virginica virginica virginica virginica
[133] virginica virginica virginica virginica
[137] virginica virginica virginica virginica
[141] virginica virginica virginica virginica
[145] virginica virginica virginica virginica
[149] virginica virginica
Levels: setosa versicolor virginica
> table(iris[,ncol(iris)])
setosa versicolor virginica
50 50 50
>
> # 2.提取内置数据iris的前5行,前4列,并转换为矩阵,赋值给a。
> iris[1:5,1:4]
Sepal.Length Sepal.Width Petal.Length Petal.Width
1 5.1 3.5 1.4 0.2
2 4.9 3.0 1.4 0.2
3 4.7 3.2 1.3 0.2
4 4.6 3.1 1.5 0.2
5 5.0 3.6 1.4 0.2
> a <- as.matrix(iris[1:5,1:4])
> a
Sepal.Length Sepal.Width Petal.Length Petal.Width
1 5.1 3.5 1.4 0.2
2 4.9 3.0 1.4 0.2
3 4.7 3.2 1.3 0.2
4 4.6 3.1 1.5 0.2
5 5.0 3.6 1.4 0.2
>
> # 3.将a的行名改为flower1,flower2...flower5。
> row.names(a) <- paste0("flower",1:5)
> row.names(a) <- paste0("flower",1:nrow(a))
> a
Sepal.Length Sepal.Width Petal.Length
flower1 5.1 3.5 1.4
flower2 4.9 3.0 1.4
flower3 4.7 3.2 1.3
flower4 4.6 3.1 1.5
flower5 5.0 3.6 1.4
Petal.Width
flower1 0.2
flower2 0.2
flower3 0.2
flower4 0.2
flower5 0.2
>
> # 4.探索列表取子集l[2]和l[[2]]的区别(提示:数据结构)
> l <- list(m1 = matrix(1:9, nrow = 3),
+ m2 = matrix(2:9, nrow = 2))
> l
$m1
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
$m2
[,1] [,2] [,3] [,4]
[1,] 2 4 6 8
[2,] 3 5 7 9
> l[2]
$m2
[,1] [,2] [,3] [,4]
[1,] 2 4 6 8
[2,] 3 5 7 9
> l[[2]]
[,1] [,2] [,3] [,4]
[1,] 2 4 6 8
[2,] 3 5 7 9
> class(l[2])#列表,且列表中只有1个矩阵
[1] "list"
> class(l[[2]])#取子集,不带列表
[1] "matrix" "array"
>
match函数的使用
> load("matchtest.Rdata")
> #a和b是两个内容相同但顺序不同的向量,才用match
> #a
> x$file_name
[1] "708a16a3-7a5e-4e27-b06b-4c3c308b11fe.htseq.counts.gz"
[2] "95e726db-5ccc-4836-a2ae-7feaddaf9f1b.htseq.counts.gz"
[3] "90a46dce-5762-47ec-925c-deff853069aa.htseq.counts.gz"
[4] "587e44e4-87ba-4981-a520-d20612486f53.htseq.counts.gz"
[5] "1b843dbb-5ef0-47ca-9783-dbeb94aa6df3.htseq.counts.gz"
[6] "09796233-3f40-4deb-b77d-2267c3afff59.htseq.counts.gz"
[7] "44f1dc34-a01e-4a7b-a7a1-a90064039fdd.htseq.counts.gz"
> #b
> colnames(y)
[1] "90a46dce-5762-47ec-925c-deff853069aa.htseq.counts.gz"
[2] "587e44e4-87ba-4981-a520-d20612486f53.htseq.counts.gz"
[3] "95e726db-5ccc-4836-a2ae-7feaddaf9f1b.htseq.counts.gz"
[4] "09796233-3f40-4deb-b77d-2267c3afff59.htseq.counts.gz"
[5] "708a16a3-7a5e-4e27-b06b-4c3c308b11fe.htseq.counts.gz"
[6] "44f1dc34-a01e-4a7b-a7a1-a90064039fdd.htseq.counts.gz"
[7] "1b843dbb-5ef0-47ca-9783-dbeb94aa6df3.htseq.counts.gz"
> #a %in% b,核查是否内容相同
> #注意括号位置:若写成 table(x$file_name) %in% colnames(y),比较的是频数表里的计数(都是1)和列名,结果全为FALSE
> table(x$file_name %in% colnames(y))
TRUE
7
> table(colnames(y) %in% x$file_name)
TRUE
7
> #a[match(b,a)]
> m=x$file_name[match(colnames(y),x$file_name)]
> m==colnames(y) #检查前后二者是否一致
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
> identical(m,colnames(y))#检查前后二者是否一致
[1] TRUE
> #属于x$file_name的下标,也可以给x$ID用,因为对应
> #所以match(colnames(y),x$file_name)也可以给x$ID用
> n = x$ID[match(colnames(y),x$file_name)]
> #m(取自x$file_name)和n(取自x$ID)这两列是按照相同的下标取子集
> #对于本来对应的两列,取完子集仍对应;因此m和n对应,而m又和colnames(y)一致,所以n和colnames(y)对应
> colnames(y)=n
>
> #方法2:调整x行的顺序,让它和colnames(y)对应(colnames(y)已在上面被改为ID,需先重新load数据)
> x <- x[match(colnames(y),x$file_name),]
>
> #方法3:调整y列的顺序,让它和x$file_name对应(colnames(y)已在上面被改为ID,需先重新load数据)
> y <- y[,match(x$file_name,colnames(y))]
>