一、单个数据框处理⭐️⭐️⭐️——使用前加载dplyr

1、sort(),order()排序书写太麻烦?试试arrange()吧⭐️⭐️⭐️

  1. > library(dplyr)
  2. > arrange(test, Sepal.Length) #按照Sepal.Length列升序排列
  3. > arrange(test, desc(Sepal.Length))
  4. > arrange(test, desc(Sepal.Width),Sepal.Length)#双排序(若A列相同,再按B列排)
  5. Sepal.Length Sepal.Width Petal.Length Petal.Width Species
  6. 1 5.1 3.5 1.4 0.2 setosa
  7. 2 6.3 3.3 6.0 2.5 virginica
  8. 3 6.4 3.2 4.5 1.5 versicolor
  9. 4 7.0 3.2 4.7 1.4 versicolor
  10. 5 4.9 3.0 1.4 0.2 setosa
  11. 6 5.8 2.7 5.1 1.9 virginica

2、新增列——mutate()

  1. test=mutate(test,new=Sepal.Length*Sepal.Width)

3、筛选行、列——filter(),select()

4、管道符号(%>%)——避免中间变量过多,可理解为“then”(然后),例如 x %>% f(y) 等价于 f(x,y)

二、将表达矩阵绘制为箱图⭐️⭐️⭐️⭐️(需将数据预处理)

image.png——————>image.png(宽变长)

  1. set.seed(10086)
  2. exp = matrix(rnorm(18),ncol = 6)
  3. exp = round(exp,2)#保留两位小数
  4. rownames(exp) = paste0("gene",1:3)
  5. colnames(exp) = paste0("test",1:6)
  6. exp[,1:3] = exp[,1:3]+1 #人工处理使差异明显
  7. exp
  8. library(tidyr)
  9. library(tibble)
  10. library(dplyr)
  11. dat = t(exp) %>%
  12. as.data.frame() %>%
  13. rownames_to_column() %>% #行名变列
  14. mutate(group = rep(c("control","treat"),each = 3))
  15. pdat = dat%>%
  16. pivot_longer(cols = starts_with("gene"),#要合并(由宽变长)的列
  17. names_to = "gene",#新列的列名,用来存放原来的列名
  18. values_to = "count")#新列的列名,用来存放原来各单元格的数值
  19. library(ggplot2)
  20. p = ggplot(pdat,aes(gene,count))+
  21. geom_boxplot(aes(fill = group))+
  22. geom_jitter()+
  23. theme_bw()
  24. p
  25. p + facet_wrap(~gene,scales = "free") #scales = "free"表示每个分面的x,y轴范围各自独立;默认为"fixed",即所有分面共用同一坐标范围

image.png

三、缺失值处理⭐️⭐️⭐️——个人认为重要,测序结果有时遇到缺失

  1. > drop_na(X) #删除所有含有NA值的行
  2. > drop_na(X,X1)#查看特定列缺失值,并去除该行
  3. #替换NA
  4. > replace_na(X,list(X2=0))
  5. #用上一行的值填充NA
  6. fill(X,X2) #若NA为第一个值则仍为NA

四、多个数据框处理

  1. > test1 <- data.frame(name = c('jimmy','nicker','Damon','Sophie'),
  2. + blood_type = c("A","B","O","AB"))
  3. > test1
  4. name blood_type
  5. 1 jimmy A
  6. 2 nicker B
  7. 3 Damon O
  8. 4 Sophie AB
  9. > test2 <- data.frame(name = c('Damon','jimmy','nicker','tony'),
  10. + group = c("group1","group1","group2","group2"),
  11. + vision = c(4.2,4.3,4.9,4.5))
  12. > test2
  13. name group vision
  14. 1 Damon group1 4.2
  15. 2 jimmy group1 4.3
  16. 3 nicker group2 4.9
  17. 4 tony group2 4.5
  18. > library(dplyr)
  19. > inner_join(test1,test2,by="name")#取交集——与merge存在区别(merge默认按连接列重新排序,inner_join保留左表的行顺序)
  20. name blood_type group vision
  21. 1 jimmy A group1 4.3
  22. 2 nicker B group2 4.9
  23. 3 Damon O group1 4.2
  24. > right_join(test1,test2,by="name")#右连接(保留右表存在信息)
  25. name blood_type group vision
  26. 1 jimmy A group1 4.3
  27. 2 nicker B group2 4.9
  28. 3 Damon O group1 4.2
  29. 4 tony <NA> group2 4.5
  30. > full_join(test1,test2,by="name")#全连接(取全集)
  31. name blood_type group vision
  32. 1 jimmy A group1 4.3
  33. 2 nicker B group2 4.9
  34. 3 Damon O group1 4.2
  35. 4 Sophie AB <NA> NA
  36. 5 tony <NA> group2 4.5
  37. > semi_join(test1,test2,by="name")#半连接(只保留左表中能在右表匹配到的行,不添加右表的列)
  38. name blood_type
  39. 1 jimmy A
  40. 2 nicker B
  41. 3 Damon O
  42. > anti_join(test1,test2,by="name")#反连接(只保留左表中在右表匹配不到的行)
  43. name blood_type
  44. 1 Sophie AB

五、字符串处理⭐️⭐️⭐️⭐️⭐️⭐️——stringr包

  1. library(stringr)
  2. x <- "The birch canoe slid on the smooth planks."
  3. #检测字符串长度
  4. str_length(x) #标点空格都算
  5. [1] 42
  6. #字符串拆分——str_split
  7. x2 = str_split(x," ")[[1]];x2#以空格拆分,提取列表中元素(两个[])
  8. x2 = str_split(x," ",simplify = T)[1,] #simplify将列表简化为矩阵
  9. #字符串连接——str_c
  10. str_c(x2,collapse = " ")#collapse:把向量内的各元素合并成一个字符串
  11. str_c(x2,1234,sep = "+")#sep:把每个元素分别与1234用"+"拼接,返回与x2等长的向量
  12. #提取字符串的一部分
  13. > str_sub(x,5,9)
  14. [1] "birch"
  15. #字符定位
  16. > str_locate(x2,"n")
  17. start end
  18. [1,] NA NA
  19. [2,] NA NA
  20. [3,] 3 3
  21. [4,] NA NA
  22. [5,] 2 2
  23. [6,] NA NA
  24. [7,] NA NA
  25. [8,] 4 4
  26. #字符检测!!!!!
  27. str_detect(x2,"n")#检测是否含有“n”,并返回逻辑值TRUE/FALSE
  28. [1] FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE
  29. #与sum和mean连用,可以统计匹配的个数和比例
  30. sum(str_detect(x2,"n"))
  31. mean(str_detect(x2,"n"))
  32. #字符串替换
  33. str_replace(x2,"o","A")#默认替换每个字符串中匹配第一个
  34. [1] "The" "birch" "canAe" "slid" "An" "the" "smAoth"
  35. [8] "planks."
  36. str_replace_all(x2,"o","A")#替换全部
  37. [1] "The" "birch" "canAe" "slid" "An" "the" "smAAth"
  38. [8] "planks."
  39. #提取匹配到的字符
  40. > str_extract(x2,"o|e")#默认提取每个字符串中匹配第一个,没有返回NA
  41. [1] "e" NA "o" NA "o" "e" "o" NA
  42. > str_extract_all(x2,"o|e")
  43. [[1]]
  44. [1] "e"
  45. [[2]]
  46. character(0)
  47. [[3]]
  48. [1] "o" "e"
  49. [[4]]
  50. character(0)
  51. [[5]]
  52. [1] "o"
  53. [[6]]
  54. [1] "e"
  55. [[7]]
  56. [1] "o" "o"
  57. [[8]]
  58. character(0)
  59. > str_extract_all(x2,"o|e",simplify = T)
  60. [,1] [,2]
  61. [1,] "e" ""
  62. [2,] "" ""
  63. [3,] "o" "e"
  64. [4,] "" ""
  65. [5,] "o" ""
  66. [6,] "e" ""
  67. [7,] "o" "o"
  68. [8,] "" ""
  69. #字符删除
  70. str_remove(x," ")#删除第一个空格字符
  71. str_remove_all(x," ")

六、条件语句与循环语句⭐️⭐️⭐️⭐️⭐️⭐️——类似于C语言

1、if条件语句

(1)与else放一起使用

image.pngimage.png

(2)ifelse

image.png

(3)多个条件

  1. i = 0
  2. if (i>0){
  3. print('+')
  4. } else if (i==0) {
  5. print('0')
  6. } else if (i< 0){
  7. print('-')
  8. }
  9. ifelse(i>0,"+",ifelse((i<0),"-","0"))

2、for循环语句

(1)两种循环方式

image.png

(2)循环结果保存

  1. > s = 0
  2. > x <- c(5,6,0,3)
  3. > result = list()
  4. > for(i in 1:length(x)){
  5. + s=s+x[[i]]
  6. + result[[i]] = c(x[[i]],s)
  7. + }
  8. > result
  9. [[1]]
  10. [1] 5 5
  11. [[2]]
  12. [1] 6 11
  13. [[3]]
  14. [1] 0 11
  15. [[4]]
  16. [1] 3 14
  17. > do.call(cbind,result)
  18. [,1] [,2] [,3] [,4]
  19. [1,] 5 6 0 3
  20. [2,] 5 11 11 14

七、矩阵、数据框的隐式循环——apply函数

image.png

八、列表的隐式循环——lapply()、sapply()

  1. #lapply()返回值是列表
  2. lapply(test,mean)
  3. #sapply 简化结果,直接返回矩阵或向量
  4. sapply(test,mean)