vector,向量,一维
matrix,表格,矩阵,二维, 只能有1种数据类型
data.frame数据框,二维, 每列只能有1种数据类型
list列表,包罗万象

判断依据:1、生成的函数;2、class或is函数

数据框来源

  1. 代码新建
  2. 已有数据转换或处理结果
  3. 读取表格文件
  4. R语言内置数据,eg.LETTERS

    新建数据框

    image.png
    筛选score>0 的行
    image.png
    删除变量
    image.png

image.png

  1. > #重点:数据框
  2. > #1.数据框来源
  3. > # (1)用代码新建
  4. > # (2)由已有数据转换或处理得到
  5. > # (3)读取表格文件
  6. > # (4)R语言内置数据
  7. >
  8. > #2.新建和读取数据框
  9. > df1 <- data.frame(gene = paste0("gene",1:4),
  10. + change = rep(c("up","down"),each = 2),
  11. + score = c(5,3,-2,-4))
  12. > df1
  13. gene change score
  14. 1 gene1 up 5
  15. 2 gene2 up 3
  16. 3 gene3 down -2
  17. 4 gene4 down -4
  18. >
  19. > df2 <- read.csv("gene.csv")
  20. > df2
  21. gene change score
  22. 1 gene1 up 5
  23. 2 gene2 up 3
  24. 3 gene3 down -2
  25. 4 gene4 down -4
  26. >
  27. > #3.数据框属性
  28. > #维度,指行数和列数
  29. > dim(df1)
  30. [1] 4 3
  31. > nrow(df1)#行数
  32. [1] 4
  33. > ncol(df1)#列数
  34. [1] 3
  35. > #行名和列名
  36. > rownames(df1)
  37. [1] "1" "2" "3" "4"
  38. > colnames(df1)
  39. [1] "gene" "change" "score"
  40. >
  41. > #4.数据框取子集
  42. > df1$gene # $一次只能提取1列,不能多列
  43. [1] "gene1" "gene2" "gene3" "gene4"
  44. > df1$score#tab键可协助补齐
  45. [1] 5 3 -2 -4
  46. > mean(df1$score)
  47. [1] 0.5
  48. >
  49. > ## 按坐标
  50. > df1[2,2]
  51. [1] "up"
  52. > df1[2,] #取第二行
  53. gene change score
  54. 2 gene2 up 3
  55. > df1[,2] #取第二列
  56. [1] "up" "up" "down" "down"
  57. > df1[c(1,3),1:2] #第一、三行和第一、二列
  58. gene change
  59. 1 gene1 up
  60. 3 gene3 down
  61. >
  62. > ## 按名字
  63. > df1[,"gene"] #可实现1次提取多列
  64. [1] "gene1" "gene2" "gene3" "gene4"
  65. > df1[,c('gene','change')]
  66. gene change
  67. 1 gene1 up
  68. 2 gene2 up
  69. 3 gene3 down
  70. 4 gene4 down
  71. >
  72. > ## 按条件(逻辑值)
  73. > df1[df1$score>0,]#筛选score>0的行
  74. gene change score
  75. 1 gene1 up 5
  76. 2 gene2 up 3
  77. > #拆分解答
  78. > df1$score
  79. [1] 5 3 -2 -4
  80. > df1$score>0
  81. [1] TRUE TRUE FALSE FALSE
  82. > df1$score[df1$score>0]
  83. [1] 5 3
  84. > df1[df1$score>0,]
  85. gene change score
  86. 1 gene1 up 5
  87. 2 gene2 up 3
  88. >
  89. > df1[df1$score>0,1]#筛选score>0的基因
  90. [1] "gene1" "gene2"
  91. > df1$gene[df1$score>0]#筛选score>0的基因
  92. [1] "gene1" "gene2"
  93. >
  94. > #5.数据框修改
  95. >
  96. > #改一个格(注:本节的输出来自重复运行代码之后,因此下面打印的df1已包含后文才修改的score列、列名CHANGE、行名r1-r4以及新增的p.value列)
  97. > df1[3,3] <- 5
  98. > df1
  99. gene CHANGE score p.value
  100. r1 gene1 up 12 0.01
  101. r2 gene2 up 23 0.02
  102. r3 gene3 down 5 0.07
  103. r4 gene4 down 2 0.05
  104. > #改一整列
  105. > df1$score <- c(12,23,50,2)
  106. > df1
  107. gene CHANGE score p.value
  108. r1 gene1 up 12 0.01
  109. r2 gene2 up 23 0.02
  110. r3 gene3 down 50 0.07
  111. r4 gene4 down 2 0.05
  112. > #新增一列
  113. > df1$p.value <- c(0.01,0.02,0.07,0.05) #对新的名称而言是新增,对原有的而言是修改
  114. > df1
  115. gene CHANGE score p.value
  116. r1 gene1 up 12 0.01
  117. r2 gene2 up 23 0.02
  118. r3 gene3 down 50 0.07
  119. r4 gene4 down 2 0.05
  120. >
  121. > #改行名和列名
  122. > rownames(df1) <- c("r1","r2","r3","r4")
  123. > #只修改某一行/列的名
  124. > colnames(df1)[2] <- "CHANGE"
  125. >
  126. > #6.两个数据框的连接,merge左连接、右连接、取交集
  127. > test1 <- data.frame(name = c('jimmy','nicker','Damon','Sophie'),
  128. + blood_type = c("A","B","O","AB"))
  129. > test1
  130. name blood_type
  131. 1 jimmy A
  132. 2 nicker B
  133. 3 Damon O
  134. 4 Sophie AB
  135. > test2 <- data.frame(name = c('Damon','jimmy','nicker','tony'),
  136. + group = c("group1","group1","group2","group2"),
  137. + vision = c(4.2,4.3,4.9,4.5))
  138. > test2
  139. name group vision
  140. 1 Damon group1 4.2
  141. 2 jimmy group1 4.3
  142. 3 nicker group2 4.9
  143. 4 tony group2 4.5
  144. >
  145. > test3 <- data.frame(NAME = c('Damon','jimmy','nicker','tony'),
  146. + weight = c(140,145,110,138))
  147. > test3
  148. NAME weight
  149. 1 Damon 140
  150. 2 jimmy 145
  151. 3 nicker 110
  152. 4 tony 138
  153. > merge(test1,test2,by="name")
  154. name blood_type group vision
  155. 1 Damon O group1 4.2
  156. 2 jimmy A group1 4.3
  157. 3 nicker B group2 4.9
  158. > merge(test1,test3,by.x = "name",by.y = "NAME")#用于名称的大小写字母不一致时
  159. name blood_type weight
  160. 1 Damon O 140
  161. 2 jimmy A 145
  162. 3 nicker B 110
  163. > ##### 矩阵和列表
  164. > m <- matrix(1:9, nrow = 3)
  165. > colnames(m) <- c("a","b","c") #加列名
  166. > m
  167. a b c
  168. [1,] 1 4 7
  169. [2,] 2 5 8
  170. [3,] 3 6 9
  171. > #取子集,不支持$
  172. > m[2,]
  173. a b c
  174. 2 5 8
  175. > m[,1]
  176. [1] 1 2 3
  177. > m[2,3]
  178. c
  179. 8
  180. > m[2:3,1:2]
  181. a b
  182. [1,] 2 5
  183. [2,] 3 6
  184. > m
  185. a b c
  186. [1,] 1 4 7
  187. [2,] 2 5 8
  188. [3,] 3 6 9
  189. > t(m) #转置
  190. [,1] [,2] [,3]
  191. a 1 2 3
  192. b 4 5 6
  193. c 7 8 9
  194. > as.data.frame(m) # 需要再赋值才会变
  195. a b c
  196. 1 1 4 7
  197. 2 2 5 8
  198. 3 3 6 9
  199. >
  200. > #列表,列表的下一级是元素
  201. > l <- list(m1 = matrix(1:9, nrow = 3),
  202. + m2 = matrix(2:9, nrow = 2))
  203. > l
  204. $m1
  205. [,1] [,2] [,3]
  206. [1,] 1 4 7
  207. [2,] 2 5 8
  208. [3,] 3 6 9
  209. $m2
  210. [,1] [,2] [,3] [,4]
  211. [1,] 2 4 6 8
  212. [2,] 3 5 7 9
  213. >
  214. > l[[2]]#取子集,取l列表中的第2个元素
  215. [,1] [,2] [,3] [,4]
  216. [1,] 2 4 6 8
  217. [2,] 3 5 7 9
  218. > l$m1 # 取l列表中的m1元素
  219. [,1] [,2] [,3]
  220. [1,] 1 4 7
  221. [2,] 2 5 8
  222. [3,] 3 6 9
  223. >
  224. > # 补充:元素的名字
  225. > scores = c(100,59,73,95,45)
  226. > names(scores) = c("jimmy","nicker","Damon","Sophie","tony")
  227. > scores
  228. jimmy nicker Damon Sophie tony
  229. 100 59 73 95 45
  230. > scores["jimmy"]
  231. jimmy
  232. 100
  233. > scores[c("jimmy","nicker")]
  234. jimmy nicker
  235. 100 59
  236. >
  237. > names(scores)[scores>60] #选出>60的子集
  238. [1] "jimmy" "Damon" "Sophie"
  239. >
  240. > # 删除变量
  241. > rm(l) #删除一个
  242. > rm(df1,df2) #删除多个
  243. Warning messages:
  244. 1: In rm(df1, df2) : 找不到对象'df1'
  245. 2: In rm(df1, df2) : 找不到对象'df2'
  246. > rm(list = ls()) #删除全部
  247. > #清空控制台: ctrl+l
  248. >
  249. > #调整元素顺序
  250. > x <- c("A","B","C","D","E");x
  251. [1] "A" "B" "C" "D" "E"
  252. > x[c(2,4,1,3,5)]
  253. [1] "B" "D" "A" "C" "E"
  254. >
  255. > scores=c(100,59,73,95,45);scores
  256. [1] 100 59 73 95 45
  257. > scores[c(5,2,3,4,1)]
  258. [1] 45 59 73 95 100
  259. > sort(scores) #另一种方式,从小到大排序
  260. [1] 45 59 73 95 100
  261. > order(scores) #返回从小到大排序所需的下标;scores[order(scores)]的结果等同于sort(scores)
  262. [1] 5 2 3 4 1
  263. >
  264. > #向量匹配排序,match
  265. > x <- c("A","B","C","D","E")
  266. > y <- c("B","D","A","C","E")
  267. > match(y,x)#以y为模版、目标、结果,以x为原料,去进行调整顺序所得到的下标
  268. [1] 2 4 1 3 5
  269. > x[match(y,x)]
  270. [1] "B" "D" "A" "C" "E"
  271. >
  272. 练习
  273. > # 练习3-2
  274. > # 1.统计内置数据iris最后一列有哪几个取值,每个取值重复了多少次
  275. > iris[,ncol(iris)]
  276. [1] setosa setosa setosa setosa
  277. [5] setosa setosa setosa setosa
  278. [9] setosa setosa setosa setosa
  279. [13] setosa setosa setosa setosa
  280. [17] setosa setosa setosa setosa
  281. [21] setosa setosa setosa setosa
  282. [25] setosa setosa setosa setosa
  283. [29] setosa setosa setosa setosa
  284. [33] setosa setosa setosa setosa
  285. [37] setosa setosa setosa setosa
  286. [41] setosa setosa setosa setosa
  287. [45] setosa setosa setosa setosa
  288. [49] setosa setosa versicolor versicolor
  289. [53] versicolor versicolor versicolor versicolor
  290. [57] versicolor versicolor versicolor versicolor
  291. [61] versicolor versicolor versicolor versicolor
  292. [65] versicolor versicolor versicolor versicolor
  293. [69] versicolor versicolor versicolor versicolor
  294. [73] versicolor versicolor versicolor versicolor
  295. [77] versicolor versicolor versicolor versicolor
  296. [81] versicolor versicolor versicolor versicolor
  297. [85] versicolor versicolor versicolor versicolor
  298. [89] versicolor versicolor versicolor versicolor
  299. [93] versicolor versicolor versicolor versicolor
  300. [97] versicolor versicolor versicolor versicolor
  301. [101] virginica virginica virginica virginica
  302. [105] virginica virginica virginica virginica
  303. [109] virginica virginica virginica virginica
  304. [113] virginica virginica virginica virginica
  305. [117] virginica virginica virginica virginica
  306. [121] virginica virginica virginica virginica
  307. [125] virginica virginica virginica virginica
  308. [129] virginica virginica virginica virginica
  309. [133] virginica virginica virginica virginica
  310. [137] virginica virginica virginica virginica
  311. [141] virginica virginica virginica virginica
  312. [145] virginica virginica virginica virginica
  313. [149] virginica virginica
  314. Levels: setosa versicolor virginica
  315. > table(iris[,ncol(iris)])
  316. setosa versicolor virginica
  317. 50 50 50
  318. >
  319. > # 2.提取内置数据iris的前5行,前4列,并转换为矩阵,赋值给a。
  320. > iris[1:5,1:4]
  321. Sepal.Length Sepal.Width Petal.Length Petal.Width
  322. 1 5.1 3.5 1.4 0.2
  323. 2 4.9 3.0 1.4 0.2
  324. 3 4.7 3.2 1.3 0.2
  325. 4 4.6 3.1 1.5 0.2
  326. 5 5.0 3.6 1.4 0.2
  327. > a <- as.matrix(iris[1:5,1:4])
  328. > a
  329. Sepal.Length Sepal.Width Petal.Length Petal.Width
  330. 1 5.1 3.5 1.4 0.2
  331. 2 4.9 3.0 1.4 0.2
  332. 3 4.7 3.2 1.3 0.2
  333. 4 4.6 3.1 1.5 0.2
  334. 5 5.0 3.6 1.4 0.2
  335. >
  336. > # 3.将a的行名改为flower1,flower2...flower5。
  337. > row.names(a) <- paste0("flower",1:5)
  338. > row.names(a) <- paste0("flower",1:nrow(a))
  339. > a
  340. Sepal.Length Sepal.Width Petal.Length
  341. flower1 5.1 3.5 1.4
  342. flower2 4.9 3.0 1.4
  343. flower3 4.7 3.2 1.3
  344. flower4 4.6 3.1 1.5
  345. flower5 5.0 3.6 1.4
  346. Petal.Width
  347. flower1 0.2
  348. flower2 0.2
  349. flower3 0.2
  350. flower4 0.2
  351. flower5 0.2
  352. >
  353. > # 4.探索列表取子集l[2]和l[[2]]的区别(提示:数据结构)
  354. > l <- list(m1 = matrix(1:9, nrow = 3),
  355. + m2 = matrix(2:9, nrow = 2))
  356. > l
  357. $m1
  358. [,1] [,2] [,3]
  359. [1,] 1 4 7
  360. [2,] 2 5 8
  361. [3,] 3 6 9
  362. $m2
  363. [,1] [,2] [,3] [,4]
  364. [1,] 2 4 6 8
  365. [2,] 3 5 7 9
  366. > l[2]
  367. $m2
  368. [,1] [,2] [,3] [,4]
  369. [1,] 2 4 6 8
  370. [2,] 3 5 7 9
  371. > l[[2]]
  372. [,1] [,2] [,3] [,4]
  373. [1,] 2 4 6 8
  374. [2,] 3 5 7 9
  375. > class(l[2])#列表,且列表中只有1个矩阵
  376. [1] "list"
  377. > class(l[[2]])#取子集,不带列表
  378. [1] "matrix" "array"
  379. >
  380. match函数的使用
  381. > load("matchtest.Rdata")
  382. > #a和b是两个内容相同但顺序不同的向量,才用match
  383. > #a
  384. > x$file_name
  385. [1] "708a16a3-7a5e-4e27-b06b-4c3c308b11fe.htseq.counts.gz"
  386. [2] "95e726db-5ccc-4836-a2ae-7feaddaf9f1b.htseq.counts.gz"
  387. [3] "90a46dce-5762-47ec-925c-deff853069aa.htseq.counts.gz"
  388. [4] "587e44e4-87ba-4981-a520-d20612486f53.htseq.counts.gz"
  389. [5] "1b843dbb-5ef0-47ca-9783-dbeb94aa6df3.htseq.counts.gz"
  390. [6] "09796233-3f40-4deb-b77d-2267c3afff59.htseq.counts.gz"
  391. [7] "44f1dc34-a01e-4a7b-a7a1-a90064039fdd.htseq.counts.gz"
  392. > #b
  393. > colnames(y)
  394. [1] "90a46dce-5762-47ec-925c-deff853069aa.htseq.counts.gz"
  395. [2] "587e44e4-87ba-4981-a520-d20612486f53.htseq.counts.gz"
  396. [3] "95e726db-5ccc-4836-a2ae-7feaddaf9f1b.htseq.counts.gz"
  397. [4] "09796233-3f40-4deb-b77d-2267c3afff59.htseq.counts.gz"
  398. [5] "708a16a3-7a5e-4e27-b06b-4c3c308b11fe.htseq.counts.gz"
  399. [6] "44f1dc34-a01e-4a7b-a7a1-a90064039fdd.htseq.counts.gz"
  400. [7] "1b843dbb-5ef0-47ca-9783-dbeb94aa6df3.htseq.counts.gz"
  401. > #a %in% b,核查是否内容相同
  402. > table(x$file_name) %in% colnames(y) #注意:括号位置有误,这里比较的是频数表而不是文件名,所以结果全为FALSE;正确写法是 table(x$file_name %in% colnames(y))
  403. [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
  404. > table(colnames(y) %in% x$file_name)
  405. TRUE
  406. 7
  407. > #a[match(b,a)]
  408. > m=x$file_name[match(colnames(y),x$file_name)]
  409. > m==colnames(y) #检查前后二者是否一致
  410. [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
  411. > identical(m,colnames(y))#检查前后二者是否一致
  412. [1] TRUE
  413. > #属于x$file_name的下标,也可以给x$ID用,因为对应
  414. > #所以match(colnames(y),x$file_name)也可以给x$ID用
  415. > n = x$ID[match(colnames(y),x$file_name)]
  416. > #11行和16行的两列按照相同下标取子集
  417. > #对于本来对应的,取完子集仍对应;因此m和n对应,13行和colnames(y)对应
  418. > colnames(y)=n
  419. >
  420. > #方法2:调整x行的顺序,让它和colnames(y)对应,例如 x <- x[match(colnames(y), x$file_name), ],之后 colnames(y) <- x$ID
  421. > ??
  422. 错误: unexpected input"?"
  423. >
  424. > #方法3:调整y列的顺序,让它和x$file_name对应,例如 y <- y[, match(x$file_name, colnames(y))],之后 colnames(y) <- x$ID
  425. > ???
  426. 错误: unexpected input"?"
  427. >