image.png

前期介绍

癌症类型

全部对比或配对样本;样本数量少时差可从别的数据中进行差异?生存?等后续分析

image.png
样本数量
image.png
image.png

差异分析

counts矩阵必定是整数
image.png
优选count数据,不得已才用以下的数据
image.png

在线数据库

xena

image.png
image.png

数据下载

数据下载

下载方式

image.png

image.png

xena

image.png
image.png
image.png

image.png
image.png

  1. proj = "TCGA-CHOL"
  2. if(F){
  3. download.file(url = paste0("https://gdc.xenahubs.net/download/",proj, ".htseq_counts.tsv.gz"),destfile = paste0(proj,".htseq_counts.tsv.gz"))
  4. download.file(url = paste0("https://gdc.xenahubs.net/download/",proj, ".GDC_phenotype.tsv.gz"),destfile = paste0(proj,".GDC_phenotype.tsv.gz"))
  5. download.file(url = paste0("https://gdc.xenahubs.net/download/",proj, ".survival.tsv"),destfile = paste0(proj,".survival.tsv"))
  6. }

读取和整理数据

查看生存信息与临床信息

生存信息与临床信息在后面的差异分析里用不到,在此仅查看一下,到生存信息部分再整理

  1. clinical = read.delim(paste0(proj,".GDC_phenotype.tsv.gz"),fill = T,header = T,sep = "\t")
  2. surv = read.delim(paste0(proj,".survival.tsv"),header = T)
  3. clinical[1:4,1:4]
  4. #> submitter_id.samples age_at_initial_pathologic_diagnosis
  5. #> 1 TCGA-ZH-A8Y2-01A 59
  6. #> 2 TCGA-ZH-A8Y7-01A 59
  7. #> 3 TCGA-W7-A93O-01A NA
  8. #> 4 TCGA-W7-A93O-11A NA
  9. #> albumin_result_lower_limit albumin_result_specified_value
  10. #> 1 NA NA
  11. #> 2 3.5 2.4
  12. #> 3 NA NA
  13. #> 4 NA NA
  14. head(surv)
  15. #> sample OS X_PATIENT OS.time
  16. #> 1 TCGA-W5-AA31-01A 0 TCGA-W5-AA31 10
  17. #> 2 TCGA-W5-AA31-11A 0 TCGA-W5-AA31 10
  18. #> 3 TCGA-WD-A7RX-01A 1 TCGA-WD-A7RX 21
  19. #> 4 TCGA-YR-A95A-01A 1 TCGA-YR-A95A 26
  20. #> 5 TCGA-4G-AAZR-11A 1 TCGA-4G-AAZR 28
  21. #> 6 TCGA-4G-AAZR-01A 1 TCGA-4G-AAZR 28

查看矩阵

gdc下载的数据从此处开始衔接

  1. dat = read.table(paste0(proj,".htseq_counts.tsv.gz"),check.names = F,row.names = 1,header = T)
  2. #逆转log
  3. dat = as.matrix(2^dat - 1)
  4. dat[1:4,1:4]
  5. #> TCGA-ZD-A8I3-01A TCGA-W5-AA2U-11A TCGA-W5-AA30-01A
  6. #> ENSG00000000003.13 5254 2476 5132
  7. #> ENSG00000000005.5 1 1 0
  8. #> ENSG00000000419.11 1212 655 1644
  9. #> ENSG00000000457.12 753 346 2652
  10. #> TCGA-W5-AA38-01A
  11. #> ENSG00000000003.13 8249
  12. #> ENSG00000000005.5 1
  13. #> ENSG00000000419.11 1696
  14. #> ENSG00000000457.12 519
  15. # 深坑一个
  16. dat[97,9]
  17. #> [1] 876
  18. as.character(dat[97,9]) #眼见不一定为实吧。
  19. #> [1] "875.999999999999"
  20. # 用apply转换为整数矩阵
  21. exp = apply(dat, 2, as.integer)
  22. exp[1:4,1:4] #行名消失
  23. #> TCGA-ZD-A8I3-01A TCGA-W5-AA2U-11A TCGA-W5-AA30-01A TCGA-W5-AA38-01A
  24. #> [1,] 5254 2476 5132 8249
  25. #> [2,] 1 1 0 1
  26. #> [3,] 1211 655 1643 1695
  27. #> [4,] 752 345 2652 519
  28. rownames(exp) = rownames(dat)
  29. exp[1:4,1:4]
  30. #> TCGA-ZD-A8I3-01A TCGA-W5-AA2U-11A TCGA-W5-AA30-01A
  31. #> ENSG00000000003.13 5254 2476 5132
  32. #> ENSG00000000005.5 1 1 0
  33. #> ENSG00000000419.11 1211 655 1643
  34. #> ENSG00000000457.12 752 345 2652
  35. #> TCGA-W5-AA38-01A
  36. #> ENSG00000000003.13 8249
  37. #> ENSG00000000005.5 1
  38. #> ENSG00000000419.11 1695
  39. #> ENSG00000000457.12 519
  40. 3.表达矩阵行名ID转换
  41. library(stringr)
  42. head(rownames(exp))
  43. #> [1] "ENSG00000000003.13" "ENSG00000000005.5" "ENSG00000000419.11"
  44. #> [4] "ENSG00000000457.12" "ENSG00000000460.15" "ENSG00000000938.11"
  45. library(AnnoProbe)
  46. annoGene(rownames(exp),ID_type = "ENSEMBL") # 转换不了
  47. #> Warning in annoGene(rownames(exp), ID_type = "ENSEMBL"): 100% of input IDs are
  48. #> fail to annotate...
  49. #> [1] SYMBOL biotypes ENSEMBL chr start end
  50. #> <0 rows> (or 0-length row.names)
  51. rownames(exp) = str_split(rownames(exp),"\\.",simplify = T)[,1];head(rownames(exp))
  52. #> [1] "ENSG00000000003" "ENSG00000000005" "ENSG00000000419" "ENSG00000000457"
  53. #> [5] "ENSG00000000460" "ENSG00000000938"
  54. re = annoGene(rownames(exp),ID_type = "ENSEMBL");head(re)
  55. #> Warning in annoGene(rownames(exp), ID_type = "ENSEMBL"): 6.54% of input IDs are
  56. #> fail to annotate...
  57. #> SYMBOL biotypes ENSEMBL chr start
  58. #> 1 DDX11L1 transcribed_unprocessed_pseudogene ENSG00000223972 chr1 11869
  59. #> 2 WASH7P unprocessed_pseudogene ENSG00000227232 chr1 14404
  60. #> 3 MIR6859-1 miRNA ENSG00000278267 chr1 17369
  61. #> 4 MIR1302-2HG lncRNA ENSG00000243485 chr1 29554
  62. #> 6 FAM138A lncRNA ENSG00000237613 chr1 34554
  63. #> 7 OR4G4P unprocessed_pseudogene ENSG00000268020 chr1 52473
  64. #> end
  65. #> 1 14409
  66. #> 2 29570
  67. #> 3 17436
  68. #> 4 31109
  69. #> 6 36081
  70. #> 7 53312
  71. library(tinyarray)
  72. exp = trans_array(exp,ids = re,from = "ENSEMBL",to = "SYMBOL")
  73. exp[1:4,1:4]
  74. #> TCGA-ZD-A8I3-01A TCGA-W5-AA2U-11A TCGA-W5-AA30-01A TCGA-W5-AA38-01A
  75. #> DDX11L1 0 0 0 1
  76. #> WASH7P 81 10 146 54
  77. #> MIR6859-1 1 0 11 1
  78. #> MIR1302-2HG 0 0 0 0

image.png
image.png

基因过滤

  1. nrow(exp)#过滤之前基因数
  2. #> [1] 56514
  3. #常用过滤标准1:仅去除在所有样本里表达量都为零的基因
  4. exp1 = exp[rowSums(exp)>0,]
  5. nrow(exp1)
  6. #> [1] 48057
  7. #常用过滤标准2(推荐):仅保留在一半以上样本里表达的基因
  8. exp = exp[apply(exp, 1, function(x) sum(x > 0) >0.5*ncol(exp)), ]
  9. nrow(exp)
  10. #> [1] 28434

分组信息获取

  1. table(str_sub(colnames(exp),14,15))
  2. #>
  3. #> 01 11
  4. #> 36 9
  5. Group = ifelse(as.numeric(str_sub(colnames(exp),14,15)) < 10,'tumor','normal')
  6. Group = factor(Group,levels = c("normal","tumor"))
  7. table(Group)
  8. #> Group
  9. #> normal tumor
  10. #> 9 36

保存数据

  1. save(exp,Group,proj,clinical,surv,file = paste0(proj,".Rdata"))

回顾代码

image.png

  1. x = round(rnorm(10),2)
  2. x
  3. sort(x,decreasing = T)
  4. #生成一个与x内容相同顺序不同的向量
  5. y = sample(x,10)
  6. x
  7. match(x,y) #谁的下标
  8. x[match(x,y)] #错
  9. y[match(x,y)]==x
  10. identical(y[match(x,y)],x) # 两个数据是否一致
  11. names(x)=letters[1:10]
  12. class(x)
  13. library(stringr)
  14. x1 = str_split(sentences[2]," ")[[1]]
  15. ifelse(str_detect(x1,"th"),
  16. "hy",
  17. "no")
  18. library(tibble)
  19. df1 = as.data.frame(x) %>%
  20. rownames_to_column(var = "letters")
  21. library(dplyr)
  22. arrange(df1,desc(x))
  23. df2 = data.frame(letters = letters,
  24. y = rnorm(26))
  25. df2
  26. merge(df1,df2,by = "letters")
  27. a = sample(letters[1:5],7,replace = T)
  28. a
  29. unique(a)
  30. a[!duplicated(a)]==unique(a)
  31. identical(a[!duplicated(a)],unique(a))
  32. x[1]
  33. x[x>1]
  34. x["a"]
  35. df1$y = iris$Sepal.Length[1:10]
  36. df1 = mutate(df1,z = iris$Sepal.Width[1:10])
  37. b = read.delim("TCGA-CHOL.GDC_phenotype.tsv.gz")
  38. load("gtf_gene.Rdata")
  39. save(b,file = "b.Rdata")
  40. pdf("a.pdf")
  41. pheatmap::pheatmap(volcano)
  42. dev.off()
  43. library(eoffice)
  44. f = pheatmap::pheatmap(volcano)
  45. topptx(f,"f.pptx")
  46. install.packages()
  47. BiocManager::install()
  48. devtools::install_github()
  49. library()

image.png

image.png
实现仅保留在一半以上样本里表达的基因的过滤标准2(推荐)

  1. exp = exp[apply(exp, 1, function(x) sum(x > 0) > 0.5*ncol(exp)), ] #矩阵按行取子集,表达矩阵筛选基因
  2. nrow(exp)
  3. #对上述代码的理解,如下:
  4. #sum(逻辑值)代表T的个数,T是表达量大于0
  5. #sum(x>0)是某基因在多少个样本里有表达
  6. sum(x1 > 0) > 22.5
  7. sum(x2 > 0) > 22.5
  8. jianan = function(x) sum(x > 0) > 0.5*ncol(exp)
  9. x1 = as.numeric(exp[1,])
  10. x2 = as.numeric(exp[2,])
  11. jianan(x1)
  12. jianan(x2)
  13. #对矩阵的每一行(基因)做了一个函数的操作
  14. k = apply(exp, 1, jianan)
  15. #矩阵按行取子集,表达矩阵筛选基因
  16. k= apply(exp, 1, function(x) sum(x > 0) > 0.5*ncol(exp));table(k)
  17. exp = exp[k,]
  18. dim(exp)
  19. exp = exp[apply(exp, 1, function(x) sum(x > 0) > 0.5*ncol(exp)), ]

image.png
image.png

gdcRNAtools

gdc-client

image.png

image.png
image.png

R markdown

渲染和导出
image.png
image.png
image.png
image.png