1. 模型可视化
  2. 生信技能树
  3. 1.简版森林图
  4. #https://zhuanlan.zhihu.com/p/109401314
  5. rm(list = ls())
  6. load("../1_data_pre/exp_surv1.Rdata")
  7. rm(surv1)
  8. load("../3_anno_immu/TCGA_idh.Rdata")
  9. library(stringr)
  10. surv1 = idh[,c("event",
  11. "time",
  12. "fp",
  13. "IDH",
  14. "age",
  15. "gender",
  16. "grade",
  17. "Meth",
  18. "1p19q")]
  19. colnames(surv1)[3] = "riskScore"
  20. library(survival)
  21. library(survminer)
  22. model1 = coxph(Surv(time,event)~.,data = surv1)
  23. ggforest(model1,data = surv1)
  24. coxdat = surv1
  25. table(coxdat$IDH)
  26. ##
  27. ## Mutant WT
  28. ## 435 246
  29. coxdat$IDH = ifelse(coxdat$IDH=="WT",1,2)
  30. table(coxdat$gender)
  31. ##
  32. ## female male
  33. ## 267 364
  34. coxdat$gender = ifelse(coxdat$gender=="female",1,2)
  35. table(coxdat$grade)
  36. ##
  37. ## G2 G3 G4
  38. ## 222 242 167
  39. coxdat$grade = ifelse(coxdat$grade=="G2",2,
  40. ifelse(coxdat$grade=="G3",3,4))
  41. table(coxdat$Meth)
  42. ##
  43. ## Methylated Unmethylated
  44. ## 488 165
  45. coxdat$Meth = ifelse(coxdat$Meth=="Unmethylated",1,2)
  46. str(coxdat)
  47. ## 'data.frame': 691 obs. of 9 variables:
  48. ## $ event : int 1 1 0 0 0 1 1 1 1 1 ...
  49. ## $ time : int 441 74 459 463 485 1427 1427 1009 388 759 ...
  50. ## $ riskScore: num 10.9 11.3 10.1 11 11.7 ...
  51. ## $ IDH : num 1 1 2 1 1 1 1 2 1 1 ...
  52. ## $ age : num 78 62 43 53 64 63 63 30 54 49 ...
  53. ## $ gender : num 2 1 2 2 2 1 1 2 2 2 ...
  54. ## $ grade : num 4 4 4 4 4 4 4 4 4 4 ...
  55. ## $ Meth : num 1 1 2 1 1 2 2 2 1 NA ...
  56. ## $ 1p19q : chr "non-codel" "non-codel" "non-codel" "non-codel" ...
  57. coxdat$`1p19q` = ifelse(coxdat$`1p19q`=="non-codel",1,2)
  58. model = coxph(Surv(time,event)~.,data = coxdat)
  59. ggforest(model,data = coxdat)
  60. 2.美化版森林图
  61. 单因素cox
  62. library(tinyarray)
  63. dat1 = surv_cox(t(coxdat[,3:8]),coxdat,continuous = T,pvalue_cutoff = 1)
  64. dat2 = as.data.frame(dat1[,c("HR","HRCILL","HRCIUL","p")])
  65. datp = format(round(dat2[,1:3],3),nsmall = 3)
  66. dat2$Trait = rownames(dat1)
  67. dat2$HR2 = paste0(datp[, 1], "(", datp[, 2], "-", datp[, 3], ")")
  68. str(dat2)
  69. ## 'data.frame': 6 obs. of 6 variables:
  70. ## $ HR : num 2.688 0.115 1.072 1.101 4.776 ...
  71. ## $ HRCILL: num 2.3899 0.0835 1.0602 0.8323 3.7853 ...
  72. ## $ HRCIUL: num 3.023 0.158 1.084 1.456 6.025 ...
  73. ## $ p : num 0 0 0 0.501 0 ...
  74. ## $ Trait : chr "riskScore" "IDH" "age" "gender" ...
  75. ## $ HR2 : chr "2.688(2.390-3.023)" "0.115(0.083-0.158)" "1.072(1.060-1.084)" "1.101(0.832-1.456)" ...
  76. dat2$p = ifelse(dat2$p<0.001,"<0.001",format(round(dat2$p,3),nsmall = 3))
  77. library(forestplot)
  78. forestplot(
  79. dat2[, c(5, 4, 6)],
  80. mean = dat2[, 1],
  81. lower = dat2[, 2],
  82. upper = dat2[, 3],
  83. zero = 1,
  84. boxsize = 0.2,
  85. col = fpColors(box = '#1075BB', lines = 'black', zero = 'grey'),
  86. lty.ci = "solid",
  87. graph.pos = 4
  88. )
  89. 多因素cox
  90. m = summary(model)
  91. p = ifelse(
  92. m$coefficients[, 5] <= 0.001,
  93. "<0.001",
  94. ifelse(
  95. m$coefficients[, 5] <= 0.01,
  96. "<0.01",
  97. ifelse(
  98. m$coefficients[, 5] <= 0.05,
  99. paste(round(m$coefficients[, 5], 3), " *"),
  100. round(m$coefficients[, 5], 3)
  101. )
  102. )
  103. )
  104. #HR和它的置信区间
  105. dat2 = as.data.frame(round(m$conf.int[, c(1, 3, 4)], 2))
  106. dat2 = tibble::rownames_to_column(dat2, var = "Trait")
  107. colnames(dat2)[2:4] = c("HR", "lower", "upper")
  108. #需要在图上显示的HR文字和p值
  109. dat2$HR2 = paste0(dat2[, 2], "(", dat2[, 3], "-", dat2[, 4], ")")
  110. dat2$p = p
  111. str(dat2)
  112. ## 'data.frame': 7 obs. of 6 variables:
  113. ## $ Trait: chr "riskScore" "IDH" "age" "gender" ...
  114. ## $ HR : num 2.43 1.22 1.05 1.29 0.97 0.72 0.68
  115. ## $ lower: num 1.96 0.68 1.03 0.93 0.69 0.5 0.37
  116. ## $ upper: num 3.02 2.19 1.06 1.8 1.35 1.05 1.24
  117. ## $ HR2 : chr "2.43(1.96-3.02)" "1.22(0.68-2.19)" "1.05(1.03-1.06)" "1.29(0.93-1.8)" ...
  118. ## $ p : chr "<0.001" "0.507" "<0.001" "0.129" ...
  119. forestplot(
  120. dat2[, c(1, 5, 6)],
  121. mean = dat2[, 2],
  122. lower = dat2[, 3],
  123. upper = dat2[, 4],
  124. zero = 1,
  125. boxsize = 0.4,
  126. col = fpColors(box = '#1075BB', lines = 'black', zero = 'grey'),
  127. lty.ci = "solid",
  128. graph.pos = 4
  129. )
  130. 3.诺模图
  131. library(rms)
  132. dd<-datadist(surv1)
  133. options(datadist="dd")
  134. mod <- cph(as.formula(paste("Surv(time, event) ~ ",paste(colnames(surv1)[3:8],collapse = "+"))),
  135. data=surv1,x=T,y=T,surv = T)
  136. surv<-Survival(mod)
  137. m1<-function(x) surv(365,x)
  138. m3<-function(x) surv(1095,x)
  139. m5<-function(x) surv(1825,x)
  140. x<-nomogram(mod,
  141. fun = list(m1,m3,m5),
  142. funlabel = c('1-y survival',
  143. '3-y survival',
  144. '5-y survival'),
  145. lp = F)
  146. plot(x)
  147. 奇怪的结果
  148. sfit1 = survival::survfit(survival::Surv(time, event) ~
  149. grade, data = surv1)
  150. survminer::ggsurvplot(sfit1, pval = TRUE, palette = "jco", data = surv1, legend = c(0.8, 0.8))
  151. 对此的进一步探索:https://www.yuque.com/xiaojiewanglezenmofenshen/bsgk2d/ci7604
  152. f1 <- cph(formula = as.formula(paste("Surv(time, event) ~ ",paste(colnames(surv1)[3:8],collapse = "+"))),
  153. data=surv1,x=T,y=T,surv = T, time.inc=365)
  154. cal1 <- calibrate(f1, cmethod="KM", method="boot", u=365, m=50, B=1000)
  155. ## Using Cox survival estimates at 365 Days
  156. f3 <- cph(formula = as.formula(paste("Surv(time, event) ~ ",paste(colnames(surv1)[3:8],collapse = "+"))),
  157. data=surv1,x=T,y=T,surv = T, time.inc=1095)
  158. cal3 <- calibrate(f3, cmethod="KM", method="boot", u=1095, m=50, B=1000)
  159. ## Using Cox survival estimates at 1095 Days
  160. f5 <- cph(formula = as.formula(paste("Surv(time, event) ~ ",paste(colnames(surv1)[3:8],collapse = "+"))),
  161. data=surv1,x=T,y=T,surv = T, time.inc=1825)
  162. cal5 <- calibrate(f5, cmethod="KM", method="boot", u=1825, m=50, B=1000)
  163. ## Using Cox survival estimates at 1825 Days
  164. plot(cal1,lwd = 2,lty = 0,errbar.col = c("#92C5DE"),
  165. bty = "l", #只画左边和下边框
  166. xlim = c(0,1),ylim= c(0,1),
  167. xlab = "Nomogram-prediced OS (%)",ylab = "Observed OS (%)",
  168. col = c("#92C5DE"),
  169. cex.lab=1.2,cex.axis=1, cex.main=1.2, cex.sub=0.6)
  170. lines(cal3[,c('mean.predicted',"KM")],
  171. type = 'b', lwd = 2, col = c("#92C5DE"), pch = 16)
  172. mtext("")
  173. plot(cal3,lwd = 2,lty = 0,errbar.col = c("#F4A582"),
  174. bty = "l", #只画左边和下边框
  175. xlim = c(0,1),ylim= c(0,1),
  176. xlab = "Nomogram-prediced OS (%)",ylab = "Observed OS (%)",
  177. col = c("#F4A582"),
  178. cex.lab=1.2,cex.axis=1, cex.main=1.2, cex.sub=0.6)
  179. lines(cal3[,c('mean.predicted',"KM")],
  180. type = 'b', lwd = 2, col = c("#F4A582"), pch = 16)
  181. mtext("")
  182. plot(cal5,lwd = 2,lty = 0,errbar.col = c("#66C2A5"),
  183. xlim = c(0,1),ylim= c(0,1),col = c("#66C2A5"),add = T)
  184. lines(cal5[,c('mean.predicted',"KM")],
  185. type = 'b', lwd = 2, col = c("#66C2A5"), pch = 16)
  186. abline(0,1, lwd = 2, lty = 3, col = c("#224444"))
  187. legend("bottomright", #图例的位置
  188. legend = c("1-y survival","3-y survival","5-y survival"), #图例文字
  189. col = c("#92C5DE", "#F4A582", "#66C2A5"), #图例线的颜色,与文字对应
  190. lwd = 2,#图例中线的粗细
  191. cex = 1.2)
  192. 4.DCA曲线
  193. #devtools::install_github('yikeshu0611/ggDCA')
  194. dat = surv1
  195. dat$nomo = predict(model1,newdata = dat)
  196. library(ggDCA)
  197. library(rms)
  198. nomo <- cph(Surv(time,event)~nomo,dat)
  199. IDH <- cph(Surv(time,event)~IDH,dat)
  200. Grade <- cph(Surv(time,event)~grade,dat)
  201. data <- dca(nomo,IDH,Grade)
  202. ggplot(data)

文献1 肺癌

该文献问题比较多,注意入坑
image.png
配对样本的数据,注意配色、图标
image.png
突变分析
image.png
image.png
image.png

文献2 黑色素瘤

image.png
image.png
image.png
寻找上述资料
xuna
image.png

image.png
image.png
其他方法计算肿瘤纯度
image.png
image.png
image.png
免疫浸润:Timer 或 cibersort 或 ssgsea
image.png

文献3 胶质瘤

图好看,但逻辑有些问题
image.png
image.png

  1. step1_数据收集
  2. 1.数据整理的说明
  3. 1.1 下载的数据
  4. 四个数据的表达矩阵、生存信息表格。
  5. TCGALGGGBM是分开的。后续用代码合并到一起。
  6. 1.2 表达矩阵的要求
  7. (1)就要是取过log、没有异常值的矩阵,标准化过的也可以,因为不做差异分析。
  8. (2)如果是转录组数据,要log之后的标准化数据。(因为这里只做生存分析,不做差异分析,所以没有必要找count数据)
  9. (3)行名都转换成gene symbol
  10. (4)只要tumor样本,不要normal
  11. 1.3 临床信息的要求
  12. (1)要有每个样本对应的生存时间和结局事件,并用代码调整顺序,与表达矩阵的样本顺序相同,严格一一对应。
  13. (2)为了后续代码统一,把生存时间和结局事件列名统一起来,都叫“time”和“event”。
  14. (3)至于时间的单位,反正是分别计算的,统不统一无所谓。
  15. 1.4 关于数据来源的说明
  16. 数据来源于哪个数据库,不是主要矛盾。重点是分清楚是芯片数据还是转录组数据,是芯片的话要能找到探针注释。如果是转录组数据,则需要分清楚是否标准化,进行了哪种标准化。反正是分别分析,又不需要合并,所以直接使用即可。
  17. 2.TCGARNA-seq数据
  18. Table S1显示样本数量691TCGAGBM数据173样本,LGG529样本。
  19. 2.1 TCGA表达矩阵
  20. exp_gbm = read.table("TCGA-GBM.htseq_fpkm.tsv.gz",
  21. header = T,row.names = 1,check.names = F)
  22. exp_lgg = read.table("TCGA-LGG.htseq_fpkm.tsv.gz",
  23. header = T,row.names = 1,check.names = F)
  24. exp1 = as.matrix(cbind(exp_gbm,exp_lgg))
  25. #行名转换为gene symbol
  26. library(stringr)
  27. library(AnnoProbe)
  28. rownames(exp1) = str_split(rownames(exp1),"\\.",simplify = T)[,1];head(rownames(exp1))
  29. ## [1] "ENSG00000242268" "ENSG00000270112" "ENSG00000167578" "ENSG00000273842"
  30. ## [5] "ENSG00000078237" "ENSG00000146083"
  31. re = annoGene(rownames(exp1),ID_type = "ENSEMBL");head(re)
  32. ## SYMBOL biotypes ENSEMBL chr start
  33. ## 1 DDX11L1 transcribed_unprocessed_pseudogene ENSG00000223972 chr1 11869
  34. ## 2 WASH7P unprocessed_pseudogene ENSG00000227232 chr1 14404
  35. ## 3 MIR6859-1 miRNA ENSG00000278267 chr1 17369
  36. ## 4 MIR1302-2HG lncRNA ENSG00000243485 chr1 29554
  37. ## 6 FAM138A lncRNA ENSG00000237613 chr1 34554
  38. ## 7 OR4G4P unprocessed_pseudogene ENSG00000268020 chr1 52473
  39. ## end
  40. ## 1 14409
  41. ## 2 29570
  42. ## 3 17436
  43. ## 4 31109
  44. ## 6 36081
  45. ## 7 53312
  46. library(tinyarray)
  47. exp1 = trans_array(exp1,ids = re,from = "ENSEMBL",to = "SYMBOL")
  48. exp1[1:4,1:4]
  49. ## TCGA-06-0878-01A TCGA-26-5135-01A TCGA-06-5859-01A TCGA-06-2563-01A
  50. ## DDX11L1 0.0000000 0.0221755 0.00000000 0.01797799
  51. ## WASH7P 0.6673943 1.7429940 0.65334641 1.57566444
  52. ## MIR6859-1 1.2006708 2.1880585 1.51229174 2.17610561
  53. ## MIR1302-2HG 0.0000000 0.0000000 0.02520857 0.00000000
  54. dim(exp1)
  55. ## [1] 56514 702
  56. #删除正常样本
  57. Group = make_tcga_group(exp1)
  58. table(Group)
  59. ## Group
  60. ## normal tumor
  61. ## 5 697
  62. exp1 = exp1[,Group == "tumor"]
  63. #过滤表达量0值太多的基因
  64. exp1 = exp1[apply(exp1, 1, function(x){sum(x>0)>200}),]
  65. 2.2 TCGA生存信息
  66. surv_gbm = readr::read_tsv("TCGA-GBM.survival.tsv")
  67. surv_gbm$TYPE = "GBM"
  68. surv_lgg = readr::read_tsv("TCGA-LGG.survival.tsv")
  69. surv_lgg$TYPE = "LGG"
  70. surv1 = rbind(surv_gbm,surv_lgg)
  71. head(surv1)
  72. ## # A tibble: 6 x 5
  73. ## sample OS `_PATIENT` OS.time TYPE
  74. ## <chr> <dbl> <chr> <dbl> <chr>
  75. ## 1 TCGA-12-0657-01A 1 TCGA-12-0657 3 GBM
  76. ## 2 TCGA-32-1977-01A 0 TCGA-32-1977 3 GBM
  77. ## 3 TCGA-19-1791-01A 0 TCGA-19-1791 4 GBM
  78. ## 4 TCGA-28-1757-01A 0 TCGA-28-1757 4 GBM
  79. ## 5 TCGA-19-2624-01A 1 TCGA-19-2624 5 GBM
  80. ## 6 TCGA-41-4097-01A 1 TCGA-41-4097 6 GBM
  81. table(colnames(exp1) %in% surv1$sample)
  82. ##
  83. ## FALSE TRUE
  84. ## 6 691
  85. s = intersect(colnames(exp1),surv1$sample)
  86. exp1 = exp1[,s]
  87. surv1 = surv1[match(s,surv1$sample),]
  88. colnames(surv1)[c(2,4)] = c("event","time")
  89. exp1[1:4,1:4]
  90. ## TCGA-06-0878-01A TCGA-26-5135-01A TCGA-06-5859-01A TCGA-06-2563-01A
  91. ## WASH7P 0.66739425 1.74299400 0.65334641 1.57566444
  92. ## MIR6859-1 1.20067078 2.18805848 1.51229174 2.17610561
  93. ## AL627309.1 0.00000000 0.02066275 0.01386984 0.00000000
  94. ## CICP27 0.09691981 0.01013527 0.00000000 0.02449211
  95. head(surv1)
  96. ## # A tibble: 6 x 5
  97. ## sample event `_PATIENT` time TYPE
  98. ## <chr> <dbl> <chr> <dbl> <chr>
  99. ## 1 TCGA-06-0878-01A 0 TCGA-06-0878 218 GBM
  100. ## 2 TCGA-26-5135-01A 1 TCGA-26-5135 270 GBM
  101. ## 3 TCGA-06-5859-01A 0 TCGA-06-5859 139 GBM
  102. ## 4 TCGA-06-2563-01A 0 TCGA-06-2563 932 GBM
  103. ## 5 TCGA-41-2571-01A 1 TCGA-41-2571 26 GBM
  104. ## 6 TCGA-28-5207-01A 1 TCGA-28-5207 343 GBM
  105. identical(colnames(exp1),surv1$sample)
  106. ## [1] TRUE
  107. 691个有生存信息的tumor样本。
  108. 3.CGGA芯片数据整理
  109. exp2 = read.table("CGGA/CGGA.mRNA_array_301_gene_level.20200506.txt",header = T,row.names = 1)
  110. surv2 = read.table("CGGA/CGGA.mRNA_array_301_clinical.20200506.txt",sep = "\t",header = T,check.names = F)
  111. head(surv2)
  112. ## CGGA_ID TCGA_subtypes PRS_type Histology Grade Gender Age OS
  113. ## 1 CGGA_11 Classical Primary GBM WHO IV Female 57 155
  114. ## 2 CGGA_124 Mesenchymal Primary GBM WHO IV Male 53 414
  115. ## 3 CGGA_126 Mesenchymal Primary GBM WHO IV Female 50 1177
  116. ## 4 CGGA_168 Mesenchymal Primary GBM WHO IV Male 17 3086
  117. ## 5 CGGA_172 Mesenchymal Primary GBM WHO IV Female 57 462
  118. ## 6 CGGA_195 Mesenchymal Primary GBM WHO IV Male 48 486
  119. ## Censor (alive=0; dead=1) Radio_status (treated=1;un-treated=0)
  120. ## 1 1 1
  121. ## 2 1 1
  122. ## 3 1 1
  123. ## 4 0 1
  124. ## 5 1 1
  125. ## 6 1 1
  126. ## Chemo_status (TMZ treated=1;un-treated=0) IDH_mutation_status
  127. ## 1 0 Wildtype
  128. ## 2 0 Wildtype
  129. ## 3 1 Wildtype
  130. ## 4 1 Wildtype
  131. ## 5 0 Wildtype
  132. ## 6 1 Wildtype
  133. ## 1p19q_Codeletion_status MGMTp_methylation_status
  134. ## 1 <NA> un-methylated
  135. ## 2 <NA> un-methylated
  136. ## 3 <NA> un-methylated
  137. ## 4 <NA> un-methylated
  138. ## 5 <NA> un-methylated
  139. ## 6 <NA> un-methylated
  140. s = intersect(surv2$CGGA_ID,colnames(exp2))
  141. exp2 = as.matrix(exp2[,s])
  142. surv2 = surv2[match(s,surv2$CGGA_ID),]
  143. colnames(surv2)[c(9,8)] = c("event","time")
  144. exp2[1:4,1:4]
  145. ## CGGA_11 CGGA_124 CGGA_126 CGGA_168
  146. ## A1BG -0.3603635 0.5649519 0.3047719 0.1749144
  147. ## A1CF 2.3413600 1.1707935 2.2081513 0.9652286
  148. ## A2BP1 0.0345194 1.0964074 0.3024883 0.0949111
  149. ## A2LD1 -0.9130564 0.5108800 0.4253669 -0.1949377
  150. head(surv2)
  151. ## CGGA_ID TCGA_subtypes PRS_type Histology Grade Gender Age time event
  152. ## 1 CGGA_11 Classical Primary GBM WHO IV Female 57 155 1
  153. ## 2 CGGA_124 Mesenchymal Primary GBM WHO IV Male 53 414 1
  154. ## 3 CGGA_126 Mesenchymal Primary GBM WHO IV Female 50 1177 1
  155. ## 4 CGGA_168 Mesenchymal Primary GBM WHO IV Male 17 3086 0
  156. ## 5 CGGA_172 Mesenchymal Primary GBM WHO IV Female 57 462 1
  157. ## 6 CGGA_195 Mesenchymal Primary GBM WHO IV Male 48 486 1
  158. ## Radio_status (treated=1;un-treated=0)
  159. ## 1 1
  160. ## 2 1
  161. ## 3 1
  162. ## 4 1
  163. ## 5 1
  164. ## 6 1
  165. ## Chemo_status (TMZ treated=1;un-treated=0) IDH_mutation_status
  166. ## 1 0 Wildtype
  167. ## 2 0 Wildtype
  168. ## 3 1 Wildtype
  169. ## 4 1 Wildtype
  170. ## 5 0 Wildtype
  171. ## 6 1 Wildtype
  172. ## 1p19q_Codeletion_status MGMTp_methylation_status
  173. ## 1 <NA> un-methylated
  174. ## 2 <NA> un-methylated
  175. ## 3 <NA> un-methylated
  176. ## 4 <NA> un-methylated
  177. ## 5 <NA> un-methylated
  178. ## 6 <NA> un-methylated
  179. identical(surv2$CGGA_ID,colnames(exp2))
  180. ## [1] TRUE
  181. 4.CGGA转录组数据整理
  182. exp3 = read.table("CGGA/CGGA.mRNAseq_325.RSEM-genes.20200506.txt",header = T,row.names = 1)
  183. exp3 = as.matrix(log2(exp3+1))
  184. surv3 = read.table("CGGA/CGGA.mRNAseq_325_clinical.20200506.txt",sep = "\t",header = T,check.names = F)
  185. head(surv3)
  186. ## CGGA_ID PRS_type Histology Grade Gender Age OS
  187. ## 1 CGGA_1001 Primary GBM WHO IV Male 11 3817
  188. ## 2 CGGA_1006 Primary AA WHO III Male 42 254
  189. ## 3 CGGA_1007 Primary GBM WHO IV Female 57 345
  190. ## 4 CGGA_1011 Primary GBM WHO IV Female 46 109
  191. ## 5 CGGA_1015 Primary GBM WHO IV Male 62 164
  192. ## 6 CGGA_1019 Recurrent rGBM WHO IV Male 60 212
  193. ## Censor (alive=0; dead=1) Radio_status (treated=1;un-treated=0)
  194. ## 1 0 0
  195. ## 2 1 1
  196. ## 3 1 1
  197. ## 4 1 1
  198. ## 5 1 1
  199. ## 6 1 0
  200. ## Chemo_status (TMZ treated=1;un-treated=0) IDH_mutation_status
  201. ## 1 1 Wildtype
  202. ## 2 1 Wildtype
  203. ## 3 1 Wildtype
  204. ## 4 0 Wildtype
  205. ## 5 0 Wildtype
  206. ## 6 1 Wildtype
  207. ## 1p19q_codeletion_status MGMTp_methylation_status
  208. ## 1 Non-codel un-methylated
  209. ## 2 Non-codel un-methylated
  210. ## 3 Non-codel un-methylated
  211. ## 4 Non-codel un-methylated
  212. ## 5 Non-codel un-methylated
  213. ## 6 Non-codel methylated
  214. colnames(surv3)[c(8,7)] = c("event","time")
  215. s = intersect(surv3$CGGA_ID,colnames(exp3))
  216. exp3 = exp3[,s]
  217. surv3 = surv3[match(s,surv3$CGGA_ID),]
  218. exp3[1:4,1:4]
  219. ## CGGA_1001 CGGA_1006 CGGA_1007 CGGA_1011
  220. ## A1BG 3.769772 3.0054000 4.958379 2.933573
  221. ## A1BG-AS1 1.641546 1.0908534 2.933573 2.411426
  222. ## A2M 8.826294 6.7487296 7.698357 9.467952
  223. ## A2M-AS1 2.104337 0.1763228 0.704872 1.384050
  224. head(surv3)
  225. ## CGGA_ID PRS_type Histology Grade Gender Age time event
  226. ## 1 CGGA_1001 Primary GBM WHO IV Male 11 3817 0
  227. ## 2 CGGA_1006 Primary AA WHO III Male 42 254 1
  228. ## 3 CGGA_1007 Primary GBM WHO IV Female 57 345 1
  229. ## 4 CGGA_1011 Primary GBM WHO IV Female 46 109 1
  230. ## 5 CGGA_1015 Primary GBM WHO IV Male 62 164 1
  231. ## 6 CGGA_1019 Recurrent rGBM WHO IV Male 60 212 1
  232. ## Radio_status (treated=1;un-treated=0)
  233. ## 1 0
  234. ## 2 1
  235. ## 3 1
  236. ## 4 1
  237. ## 5 1
  238. ## 6 0
  239. ## Chemo_status (TMZ treated=1;un-treated=0) IDH_mutation_status
  240. ## 1 1 Wildtype
  241. ## 2 1 Wildtype
  242. ## 3 1 Wildtype
  243. ## 4 0 Wildtype
  244. ## 5 0 Wildtype
  245. ## 6 1 Wildtype
  246. ## 1p19q_codeletion_status MGMTp_methylation_status
  247. ## 1 Non-codel un-methylated
  248. ## 2 Non-codel un-methylated
  249. ## 3 Non-codel un-methylated
  250. ## 4 Non-codel un-methylated
  251. ## 5 Non-codel un-methylated
  252. ## 6 Non-codel methylated
  253. identical(surv3$CGGA_ID,colnames(exp3))
  254. ## [1] TRUE
  255. 我没仔细看这个表达矩阵具体是哪种标准化,反正这个数据只是作为例子,我们就不深究到底是啥了,直接拿来用。注意如果是自己要用来发文章的数据,是要看清楚的,没标准化需要自行标准化,cpmtmpfpkm都行。
  256. 5.GSE16011的整理
  257. library(tinyarray)
  258. geo = geo_download("GSE16011")
  259. library(stringr)
  260. #只要tumor样本
  261. k = str_detect(geo$pd$title,"glioma");table(k)
  262. ## k
  263. ## FALSE TRUE
  264. ## 8 276
  265. geo$exp = geo$exp[,k]
  266. geo$pd = geo$pd[k,]
  267. # 把行名转换为gene symbol
  268. gpl = GEOquery::getGEO(filename = "GPL8542_family.soft.gz",destdir = ".")
  269. ids = gpl@dataTable@table[,1:2]
  270. library(clusterProfiler)
  271. library(org.Hs.eg.db)
  272. e2s = bitr(ids$ORF,fromType = "ENTREZID",toType = "SYMBOL",OrgDb = "org.Hs.eg.db")
  273. ids = merge(ids,e2s,by.x = "ORF",by.y = "ENTREZID")
  274. ids = ids[,2:3]
  275. colnames(ids) = c("probe_id","symbol")
  276. exp4 = trans_array(geo$exp,ids)
  277. surv4 = geo$pd[,c(1,4,7,9)]
  278. exp4[1:4,1:4]
  279. ## GSM405201 GSM405202 GSM405203 GSM405204
  280. ## A1BG 7.219401 6.031261 6.707681 6.330666
  281. ## A2M 12.206626 13.260587 12.352693 13.044375
  282. ## NAT1 5.182989 7.213272 6.570733 6.343490
  283. ## NAT2 5.239842 5.260373 5.697435 4.859682
  284. head(surv4)
  285. ## title age at diagnosis:ch1 histology:ch1 gender:ch1
  286. ## GSM405201 glioma 8 44.57 OD (grade III) Female
  287. ## GSM405202 glioma 9 28.69 OD (grade III) Male
  288. ## GSM405203 glioma 11 38.58 OD (grade III) Male
  289. ## GSM405204 glioma 13 33.89 OD (grade III) Male
  290. ## GSM405205 glioma 20 48.03 OD (grade III) Male
  291. ## GSM405206 glioma 21 31.53 OD (grade III) Male
  292. identical(rownames(surv4),colnames(exp4))
  293. ## [1] TRUE
  294. 临床信息表格里没有生存信息,所以从文章附件里找
  295. 从文章附件里提取临床信息表格
  296. https://aacrjournals.org/cancerres/article/69/23/9065/553005/Intrinsic-Gene-Expression-Profiles-of-Gliomas-Are
  297. 包非常难装,需要配置java,但这个需求不常用,直接跳过这个过程。
  298. if(!file.exists("exp_surv4.Rdata")){
  299. library(tabulizer)
  300. f <- "00085472can092307-sup-stabs_1-6.pdf"
  301. re <- extract_tables(f,pages = 1:10) #提取前10页的表格。
  302. str(re)
  303. re = do.call(rbind,re)
  304. re[1:4,1:4]
  305. colnames(re) = re[1,]
  306. re = re[-1,]
  307. re = data.frame(re)
  308. re[re==""]=NA
  309. library(readr)
  310. re$Survival..years. = parse_double(re$Survival..years.,locale = locale(decimal_mark = ","))
  311. re$Age.at.diagnosis = parse_double(re$Age.at.diagnosis,locale = locale(decimal_mark = ","))
  312. dim(exp4)
  313. k = re$Reviewed.histological.diagnosis!="control";table(k)
  314. re = re[k,]
  315. re$Database.number = paste("glioma",re$Database.number)
  316. surv4$ID = rownames(surv4)
  317. surv4 = merge(surv4,re,by.x = "title",by.y = "Database.number")
  318. colnames(surv4)[13:14] = c("event","time")
  319. table(surv4$event)
  320. surv4$time = as.integer(surv4$time*365) #天为单位
  321. surv4$event[surv4$event=="Lost to\rfollow up"]=NA
  322. table(surv4$event)
  323. surv4$event=ifelse(surv4$event=="Alive",0,1)
  324. head(surv4)
  325. s = intersect(surv4$ID,colnames(exp4))
  326. exp4 = exp4[,s]
  327. surv4 = surv4[match(s,surv4$ID),]
  328. save(exp4,surv4,file = "exp_surv4.Rdata")
  329. }
  330. load("exp_surv4.Rdata")
  331. exp4[1:4,1:4]
  332. ## GSM405234 GSM405235 GSM405236 GSM405237
  333. ## A1BG 7.613150 6.995666 7.337393 6.287498
  334. ## A2M 13.166457 13.003225 13.814942 12.991301
  335. ## NAT1 5.581672 7.470894 7.845210 6.088651
  336. ## NAT2 5.206464 4.618245 4.881652 5.011908
  337. head(surv4)
  338. ## title age at diagnosis:ch1 histology:ch1 gender:ch1 ID Gender
  339. ## 1 glioma 101 57.68 GBM (grade IV) Female GSM405234 Female
  340. ## 2 glioma 104 71.09 GBM (grade IV) Male GSM405235 Male
  341. ## 3 glioma 105 52.20 GBM (grade IV) Male GSM405236 Male
  342. ## 4 glioma 107 51.17 GBM (grade IV) Male GSM405237 Male
  343. ## 5 glioma 11 38.58 OD (grade III) Male GSM405203 Male
  344. ## 6 glioma 111 37.25 GBM (grade IV) Male GSM405238 Male
  345. ## Reviewed.histological.diagnosis Age.at.diagnosis KPS Type.of.surgery
  346. ## 1 GBM (grade IV) 57.68 70 Partial resection
  347. ## 2 GBM (grade IV) 71.09 80 Partial resection
  348. ## 3 GBM (grade IV) 52.20 80 Complete resection
  349. ## 4 GBM (grade IV) 51.17 70 Stereotactic biopsy
  350. ## 5 OD (grade III) 38.58 <NA> Complete resection
  351. ## 6 GBM (grade IV) 37.25 80 Stereotactic biopsy
  352. ## Radiotherapy Chemotherapy event time
  353. ## 1 yes no 1 226
  354. ## 2 yes no 1 76
  355. ## 3 yes no 1 375
  356. ## 4 yes no 1 149
  357. ## 5 yes no 1 3255
  358. ## 6 yes no 1 343
  359. identical(surv4$ID,colnames(exp4))
  360. ## [1] TRUE
  361. 检查和保存数据
  362. par(mfrow = c(2,2))
  363. boxplot(exp1[,1:10])
  364. boxplot(exp2[,1:10])
  365. boxplot(exp3[,1:10])
  366. boxplot(exp4[,1:10])
  367. save(exp1,surv1,file = "exp_surv1.Rdata")
  368. save(exp2,surv2,file = "exp_surv2.Rdata")
  369. save(exp3,surv3,file = "exp_surv3.Rdata")
  370. save(exp4,surv4,file = "exp_surv4.Rdata")

image.png

  1. surv_cox_lasso
  2. ER相关的基因
  3. endoplasmic reticulum stress
  4. gs = rio::import("GeneCards-SearchResults.csv")
  5. k = gs$`Relevance score`>=7;table(k)
  6. ## k
  7. ## FALSE TRUE
  8. ## 6267 802
  9. gs = gs[k,]
  10. 两种算法计算每个基因的p值,并取交集
  11. 4个数据需要循环,先拿其中一个数据跑通,再把代码写入循环
  12. library(tinyarray)
  13. load("../1_data_pre/exp_surv1.Rdata")
  14. exp1 = exp1[rownames(exp1) %in% gs$`Gene Symbol`,]
  15. c1 = surv_cox(exp1,surv1,continuous = T)
  16. head(c1)
  17. ## coef se z p HR HRse HRz
  18. ## UBE2J2 1.4404365 0.13446765 10.712141 0.000000e+00 4.222539 0.5677949 5.675533
  19. ## RER1 1.8780151 0.11813991 15.896534 0.000000e+00 6.540509 0.7726952 7.170369
  20. ## ICMT 1.0621919 0.11658641 9.110770 0.000000e+00 2.892705 0.3372500 5.612170
  21. ## PARK7 0.6545554 0.13911446 4.705157 2.536705e-06 1.924287 0.2676961 3.452746
  22. ## H6PD 0.8217466 0.10642461 7.721396 1.154632e-14 2.274469 0.2420595 5.265107
  23. ## FBXO6 0.9195681 0.08969749 10.251882 0.000000e+00 2.508207 0.2249799 6.703741
  24. ## HRp HRCILL HRCIUL
  25. ## UBE2J2 1.382573e-08 3.244252 5.495823
  26. ## RER1 7.479573e-13 5.188606 8.244654
  27. ## ICMT 1.998047e-08 2.301789 3.635320
  28. ## PARK7 5.549107e-04 1.465060 2.527460
  29. ## H6PD 1.401079e-07 1.846253 2.802004
  30. ## FBXO6 2.031497e-11 2.103840 2.990295
  31. nrow(c1)
  32. ## [1] 653
  33. k1 = surv_KM(exp1,surv1)
  34. head(k1) #p值实在太小,逼近0
  35. ## UBE2J2 RER1 FBXO6 DDOST CDC42 SELENON
  36. ## 0 0 0 0 0 0
  37. length(k1)
  38. ## [1] 642
  39. g1 = intersect(rownames(c1),names(k1))
  40. head(g1)
  41. ## [1] "UBE2J2" "RER1" "ICMT" "PARK7" "H6PD" "FBXO6"
  42. length(g1)
  43. ## [1] 615
  44. 循环
  45. load("../1_data_pre/exp_surv2.Rdata")
  46. load("../1_data_pre/exp_surv3.Rdata")
  47. load("../1_data_pre/exp_surv4.Rdata")
  48. exp = list(exp1,exp2,exp3,exp4)
  49. surv = list(surv1,surv2,surv3,surv4)
  50. g = list()
  51. for(i in 1:4){
  52. exp[[i]] = exp[[i]][rownames(exp[[i]]) %in% gs$`Gene Symbol`,]
  53. c1 = surv_cox(exp[[i]],surv[[i]],continuous = T)
  54. k = surv_KM(exp[[i]],surv[[i]])
  55. g[[i]] = intersect(rownames(c1),names(k))
  56. }
  57. names(g) = c("TCGA","CGGA_array","CGGA","GSE16011")
  58. sapply(g, length)
  59. ## TCGA CGGA_array CGGA GSE16011
  60. ## 615 413 551 335
  61. v_plot = draw_venn(g,"")
  62. ggplot2::ggsave(v_plot,filename = "venn.png")
  63. 4个数据,两种算法p<0.05的基因取交集,得到195个基因,用于lasso回归
  64. g = intersect_all(g)
  65. length(g)
  66. ## [1] 195
  67. lasso回归
  68. 使用TCGA的数据构建模型
  69. library(survival)
  70. x = t(exp1[g,])
  71. y = data.matrix(Surv(surv1$time,surv1$event))
  72. library(glmnet)
  73. set.seed(10210)
  74. cvfit = cv.glmnet(x, y, family="cox")
  75. fit=glmnet(x, y, family = "cox")
  76. coef = coef(fit, s = cvfit$lambda.min)
  77. index = which(coef != 0)
  78. actCoef = coef[index]
  79. lassoGene = row.names(coef)[index]
  80. lassoGene
  81. ## [1] "GPX7" "MR1" "SHISA5" "CP" "PPM1L" "DNAJB11"
  82. ## [7] "SNCA" "ANXA5" "HFE" "EPM2A" "FKBP14" "BET1"
  83. ## [13] "SERPINE1" "CASP2" "BAG1" "SIRT1" "DNAJB12" "ERLIN1"
  84. ## [19] "CYP2E1" "CASP4" "SLN" "ERP27" "DDIT3" "ATP2A2"
  85. ## [25] "ERP29" "MYH7" "CDKN3" "MAPK3" "MMP2" "NOL3"
  86. ## [31] "BRCA1" "PRNP" "MX1" "RNF185" "SREBF2" "GLA"
  87. par(mfrow = c(1,2))
  88. plot(cvfit)
  89. plot(fit,xvar="lambda",label = F)
  90. 逐步回归法
  91. library(My.stepwise)
  92. vl <- lassoGene
  93. dat = cbind(surv1,t(exp1[lassoGene,]))
  94. # My.stepwise.coxph(Time = "time",
  95. # Status = "event",
  96. # variable.list = vl,
  97. # data = dat)
  98. model = coxph(formula = Surv(time, event) ~ HFE + SHISA5 + BRCA1 + EPM2A +
  99. ERLIN1 + GPX7 + SLN + DNAJB11 + MMP2 + NOL3 + CP + ATP2A2 +
  100. GLA + MAPK3 + SREBF2 + CASP2 + SNCA + DDIT3 + BAG1 + ANXA5,
  101. data = dat)
  102. summary(model)$concordance
  103. ## C se(C)
  104. ## 0.865988496 0.009754695
  105. genes = names(model$coefficients);length(genes)
  106. ## [1] 20
  107. library(survminer)
  108. ggforest(model,data = dat)
  109. ggsave("forest.png",width = 10,height = 8)
  110. 计算风险评分
  111. dats = list(dat1 = cbind(surv1,t(exp1[genes,])),
  112. dat2 = cbind(surv2,t(exp2[genes,])),
  113. dat3 = cbind(surv3,t(exp3[genes,])),
  114. dat4 = cbind(surv4,t(exp4[genes,])))
  115. library(dplyr)
  116. survs = lapply(dats, function(x){
  117. fp = apply(x[,genes], 1,function(k)sum(model$coefficients * k)) #lasso回归模型的预测值就是线性加乘
  118. # x = x[,!(colnames(x)%in% genes)]
  119. x$fp = fp
  120. x$Risk = ifelse(x$fp<median(x$fp),"low","high")
  121. x$Risk = factor(x$Risk,levels = c("low","high"))
  122. return(x)
  123. })
  124. head(survs[[1]])
  125. ## sample event _PATIENT time TYPE HFE
  126. ## TCGA-06-0878-01A TCGA-06-0878-01A 0 TCGA-06-0878 218 GBM 2.3744248
  127. ## TCGA-26-5135-01A TCGA-26-5135-01A 1 TCGA-26-5135 270 GBM 0.7702822
  128. ## TCGA-06-5859-01A TCGA-06-5859-01A 0 TCGA-06-5859 139 GBM 2.5060929
  129. ## TCGA-06-2563-01A TCGA-06-2563-01A 0 TCGA-06-2563 932 GBM 1.6765673
  130. ## TCGA-41-2571-01A TCGA-41-2571-01A 1 TCGA-41-2571 26 GBM 0.7707545
  131. ## TCGA-28-5207-01A TCGA-28-5207-01A 1 TCGA-28-5207 343 GBM 1.7947694
  132. ## SHISA5 BRCA1 EPM2A ERLIN1 GPX7 SLN
  133. ## TCGA-06-0878-01A 4.891004 1.929324 1.824520 2.963291 4.161208 3.2520898
  134. ## TCGA-26-5135-01A 6.280796 1.535608 1.513929 2.027252 3.812280 3.3372085
  135. ## TCGA-06-5859-01A 4.708155 3.118322 1.696385 2.702454 4.032823 3.6417199
  136. ## TCGA-06-2563-01A 5.343170 1.324959 2.174987 2.690188 4.270150 0.9712369
  137. ## TCGA-41-2571-01A 5.184783 1.082552 1.091940 1.892868 3.539653 2.8842578
  138. ## TCGA-28-5207-01A 5.152544 1.958569 2.323285 2.802503 3.302680 0.9862039
  139. ## DNAJB11 MMP2 NOL3 CP ATP2A2 GLA MAPK3
  140. ## TCGA-06-0878-01A 4.489935 5.196985 3.954610 4.729828 4.931262 3.699737 4.639285
  141. ## TCGA-26-5135-01A 3.934268 5.267932 3.521318 1.514229 5.054068 3.385206 5.064507
  142. ## TCGA-06-5859-01A 3.360617 3.364497 2.896874 3.009394 4.516211 4.166703 4.214156
  143. ## TCGA-06-2563-01A 3.828734 6.471976 1.931305 1.262133 4.879086 3.787763 4.527579
  144. ## TCGA-41-2571-01A 3.083744 8.445514 3.889378 1.410165 4.753264 3.945768 4.887269
  145. ## TCGA-28-5207-01A 3.558798 4.552452 2.013993 1.442487 4.530683 3.779798 4.950130
  146. ## SREBF2 CASP2 SNCA DDIT3 BAG1 ANXA5 fp
  147. ## TCGA-06-0878-01A 3.646879 3.056804 1.750464 5.599779 2.981576 7.777013 12.03249
  148. ## TCGA-26-5135-01A 4.656178 2.837734 2.655945 5.330168 3.299911 7.067037 12.53646
  149. ## TCGA-06-5859-01A 4.619645 2.662097 1.341060 5.621570 3.197839 6.306859 11.79212
  150. ## TCGA-06-2563-01A 4.689714 3.175957 2.339013 4.388975 3.533318 7.995120 10.95347
  151. ## TCGA-41-2571-01A 4.736130 3.262038 2.155570 5.840359 3.408114 6.413893 12.78245
  152. ## TCGA-28-5207-01A 4.002934 3.147226 3.431305 7.208496 3.488568 7.653079 11.76648
  153. ## Risk
  154. ## TCGA-06-0878-01A high
  155. ## TCGA-26-5135-01A high
  156. ## TCGA-06-5859-01A high
  157. ## TCGA-06-2563-01A high
  158. ## TCGA-41-2571-01A high
  159. ## TCGA-28-5207-01A high
  160. save(model,genes,survs,file = "model_genes_survs.Rdata")
  161. survs里的每个数据都是由临床信息、建模基因的表达量、fp(预测值)、Risk(风险分组)构成,整理成了相同格式。
  162. timeROC曲线
  163. library(timeROC)
  164. result = list()
  165. p = list()
  166. for(i in 1:4){
  167. result[[i]] <-with(survs[[i]], timeROC(T=time,
  168. delta=event,
  169. marker=fp,
  170. cause=1,
  171. times=c(365,1095,1825),
  172. iid = TRUE))
  173. dat = data.frame(fpr = as.numeric(result[[i]]$FP),
  174. tpr = as.numeric(result[[i]]$TP),
  175. time = rep(as.factor(c(365,1095,1825)),each = nrow(result[[i]]$TP)))
  176. library(ggplot2)
  177. p[[i]] = ggplot() +
  178. geom_line(data = dat,aes(x = fpr, y = tpr,color = time),size = 1) +
  179. scale_color_manual(name = NULL,values = c("#92C5DE", "#F4A582", "#66C2A5"),
  180. labels = paste0("AUC of ",c(1,3,5),"-y survival: ",
  181. format(round(result[[i]]$AUC,2),nsmall = 2)))+
  182. geom_line(aes(x=c(0,1),y=c(0,1)),color = "grey")+
  183. theme_bw()+
  184. theme(panel.grid = element_blank(),
  185. legend.background = element_rect(linetype = 1, size = 0.2, colour = "black"),
  186. legend.position = c(0.765,0.125))+
  187. scale_x_continuous(expand = c(0.005,0.005))+
  188. scale_y_continuous(expand = c(0.005,0.005))+
  189. labs(x = "1 - Specificity",
  190. y = "Sensitivity")+
  191. coord_fixed()
  192. }
  193. library(patchwork)
  194. wrap_plots(p)
  195. ggsave("time_ROC.png",height = 10,width = 10)
  196. 高低风险KM_plot
  197. library(survival)
  198. library(survminer)
  199. corhorts = c("TCGA","CGGA_array","CGGA","GSE16011")
  200. splots = list()
  201. for(i in 1:4){
  202. x = survs[[i]]
  203. sfit1 = survfit(Surv(time, event) ~ Risk, data = x)
  204. splots[[i]] = ggsurvplot(sfit1, pval = TRUE, palette = "jco",
  205. data = x, legend = c(0.8, 0.8), title =corhorts[[i]],risk.table = T)
  206. }
  207. png("survs.png",height = 800,width = 800)
  208. arrange_ggsurvplots(splots,nrow = 2)
  209. dev.off()
  210. ## png
  211. ## 2
  212. ### 三图联动
  213. 有缺失timeevent的样本前面没有去掉,画图会出NA,但是动了前面结果就会变化很大,猜测作者是只在画图时去掉了,不参考他的做法,你自己做时整理临床信息时就去掉,这里就不会出NA了。
  214. source("risk_plot.R")
  215. risk_plot(survs[[1]],genes)
  216. risk_plot(survs[[2]],genes)
  217. risk_plot(survs[[3]],genes)
  218. risk_plot(survs[[4]],genes)

for循环
image.png
逐步回归法
image.png
image.png
三图联动
三张图的顺序都一样
image.png
image.png
image.png
image.png免疫亚型

  1. 免疫相关的分析
  2. 生信技能树
  3. 1.抑制cancerimmunity cycle 的基因
  4. http://biocc.hrbmu.edu.cn/TIP/index.jsp
  5. rm(list = ls())
  6. library(tinyarray)
  7. tip = read.delim("TIP_genes.txt",header = T)
  8. nrow(tip)
  9. ## [1] 273
  10. head(tip)
  11. ## GeneSymbol Steps Direction ImmuneCellType
  12. ## 1 IL10 1 positive Multiple
  13. ## 2 TGFB1 1 positive Multiple
  14. ## 3 HMGB1 1 positive Multiple
  15. ## 4 ANXA1 1 positive Multiple
  16. ## 5 CALR 1 positive Multiple
  17. ## 6 CXCL10 1 positive Multiple
  18. gs = tip$GeneSymbol[tip$Direction =="negative"];length(gs)
  19. ## [1] 54
  20. load("../1_data_pre/exp_surv1.Rdata")
  21. load("../1_data_pre/exp_surv2.Rdata")
  22. load("../1_data_pre/exp_surv3.Rdata")
  23. load("../1_data_pre/exp_surv4.Rdata")
  24. load("../2_model/model_genes_survs.Rdata")
  25. exps = list(exp1,exp2,exp3,exp4)
  26. #i = 1
  27. p1 = list()
  28. corhorts = c("TCGA","CGGA_array","CGGA","GSE16011")
  29. for(i in 1:4){
  30. n = exps[[i]][rownames(exps[[i]])%in% gs,]
  31. n = n[,order(survs[[i]]$fp)]
  32. g = sort(survs[[i]]$Risk)
  33. rn = apply(n,1,function(x){
  34. #x = n[1,]
  35. p = wilcox.test(x~g)$p.value
  36. j = ifelse(p<=0.001,
  37. "***",
  38. ifelse(p<=0.01,
  39. "**",
  40. ifelse(p<=0.05,"*","")))
  41. return(j)
  42. })
  43. rownames(n) = paste0(rownames(n),rn)
  44. p1[[i]] =draw_heatmap(n,g,cluster_cols = F,main = corhorts[[i]],show_rownames = T)
  45. }
  46. library(patchwork)
  47. wrap_plots(p1)
  48. 箱线图
  49. g4 = c("VEGFA","TGFB1","CD95L","CD70")
  50. p2 = list()
  51. library(ggplot2)
  52. for(i in 1:4){
  53. p2[[i]] = draw_boxplot(exps[[i]][rownames(exps[[i]])%in% g4,],survs[[i]]$Risk,sort = F)+
  54. ggtitle(corhorts[[i]])+
  55. theme(plot.title = element_text(hjust = 0.5))
  56. }
  57. library(ggplot2)
  58. wrap_plots(p2) + plot_layout(guides = "collect") &
  59. theme(legend.position='right')
  60. 没有找到CD95L基因。这里仅作例子,自己文章中用到的重要基因,可以查查别名。
  61. 2.免疫细胞丰度
  62. ssgsea方法。mmc3.xlsx出自这篇文章的TableS6:https://www.sciencedirect.com/science/article/pii/S2211124716317090
  63. geneset = rio::import("mmc3.xlsx",skip = 2)
  64. geneset = split(geneset$Metagene,geneset$`Cell type`)
  65. lapply(geneset[1:3], head)
  66. ## $`Activated B cell`
  67. ## [1] "ADAM28" "CD180" "CD79B" "BLK" "CD19" "MS4A1"
  68. ##
  69. ## $`Activated CD4 T cell`
  70. ## [1] "AIM2" "BIRC3" "BRIP1" "CCL20" "CCL4" "CCL5"
  71. ##
  72. ## $`Activated CD8 T cell`
  73. ## [1] "ADRM1" "AHSA1" "C1GALT1C1" "CCT6B" "CD37" "CD3D"
  74. library(GSVA)
  75. f = "ssgsea_result.Rdata"
  76. if(!file.exists(f)){
  77. p = list()
  78. for(i in 1:4){
  79. #i = 1
  80. re <- gsva(exps[[i]], geneset, method="ssgsea",
  81. mx.diff=FALSE, verbose=FALSE
  82. )
  83. p[[i]] = draw_boxplot(re,survs[[i]]$Risk)+
  84. ggtitle(corhorts[[i]])+
  85. theme(plot.title = element_text(hjust = 0.5))+
  86. theme(plot.margin=unit(c(1,3,1,1),'lines')) #调整边距
  87. }
  88. save(re,p,file = f)
  89. }
  90. load(f)
  91. wrap_plots(p)+ plot_layout(guides = "collect") &
  92. theme(legend.position='right')
  93. 免疫亚型
  94. #devtools::install_github("Gibbsdavidl/ImmuneSubtypeClassifier")
  95. library(ImmuneSubtypeClassifier)
  96. f = "immu_res.Rdata"
  97. if(!file.exists(f)){
  98. res = list()
  99. for(i in 1:4){
  100. a <- callEnsemble(X = exps[[i]], geneids = 'symbol')[,1:2]
  101. a$corhort = corhorts[[i]]
  102. a$Risk = survs[[i]]$Risk
  103. res[[i]] = a
  104. }
  105. save(res,file = f)
  106. }
  107. load(f)
  108. 我检查了一下这个方法和xena整理结果是否一致
  109. a1 = res[[1]]
  110. head(a1)
  111. ## SampleIDs BestCall corhort Risk
  112. ## 1 TCGA-06-0878-01A 4 TCGA high
  113. ## 2 TCGA-26-5135-01A 4 TCGA high
  114. ## 3 TCGA-06-5859-01A 4 TCGA high
  115. ## 4 TCGA-06-2563-01A 4 TCGA high
  116. ## 5 TCGA-41-2571-01A 4 TCGA high
  117. ## 6 TCGA-28-5207-01A 4 TCGA high
  118. a1$SampleIDs = str_sub(a1$SampleIDs,1,15)
  119. a2 = read.delim("Subtype_Immune_Model_Based.txt.gz")
  120. head(a2)
  121. ## sample Subtype_Immune_Model_Based
  122. ## 1 TCGA-A5-A0GI-01 Wound Healing (Immune C1)
  123. ## 2 TCGA-S9-A7J2-01 Immunologically Quiet (Immune C5)
  124. ## 3 TCGA-EK-A2RE-01 IFN-gamma Dominant (Immune C2)
  125. ## 4 TCGA-D5-5538-01 IFN-gamma Dominant (Immune C2)
  126. ## 5 TCGA-F4-6854-01 Wound Healing (Immune C1)
  127. ## 6 TCGA-AX-A2H7-01 Wound Healing (Immune C1)
  128. table(a1$SampleIDs %in% a2$sample)
  129. ##
  130. ## FALSE TRUE
  131. ## 34 657
  132. a = merge(a1,a2,by.x = "SampleIDs",by.y = "sample")
  133. a$check = str_extract(a$Subtype_Immune_Model_Based,"\\d")
  134. table(a$check==a$BestCall)
  135. ##
  136. ## FALSE TRUE
  137. ## 59 598
  138. 大部分一致吧。但还是有一些出入的
  139. res2 = do.call(rbind,res)
  140. head(res2)
  141. ## SampleIDs BestCall corhort Risk
  142. ## 1 TCGA-06-0878-01A 4 TCGA high
  143. ## 2 TCGA-26-5135-01A 4 TCGA high
  144. ## 3 TCGA-06-5859-01A 4 TCGA high
  145. ## 4 TCGA-06-2563-01A 4 TCGA high
  146. ## 5 TCGA-41-2571-01A 4 TCGA high
  147. ## 6 TCGA-28-5207-01A 4 TCGA high
  148. res2$BestCall = factor(paste0("C",res2$BestCall))
  149. library(dplyr)
  150. dat2 <- res2 %>%
  151. group_by(corhort,Risk, BestCall) %>%
  152. summarise(count = n())
  153. library(paletteer)
  154. pb = ggplot() + geom_bar(data =dat2, aes(x = Risk, y = count, fill = BestCall),
  155. stat = "identity",
  156. position = "fill")+
  157. theme_classic()+
  158. scale_fill_paletteer_d("RColorBrewer::Set2")+
  159. labs(x = "Risk",y = "Percentage")+
  160. facet_wrap(~corhort,nrow = 1)
  161. pb
  162. 免疫检查点基因
  163. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7786136/
  164. # 别名 B7-H3 是CD276;PD-L1是CD274;PD-1是PDCD1,PDCD1LG2是PD-L2
  165. tmp = readLines("immune_checkpoint.txt")
  166. genes = str_split(tmp,", ")[[1]]
  167. sapply(exps, function(x){genes[!(genes %in% rownames(x))]})
  168. ## [[1]]
  169. ## character(0)
  170. ##
  171. ## [[2]]
  172. ## character(0)
  173. ##
  174. ## [[3]]
  175. ## [1] "BTNL2" "IDO2" "KIR3DL1"
  176. ##
  177. ## [[4]]
  178. ## [1] "IDO2"
  179. pi = list()
  180. for(i in 1:4){
  181. n = exps[[i]][rownames(exps[[i]])%in% genes,]
  182. n = n[,order(survs[[i]]$fp)]
  183. g = sort(survs[[i]]$Risk)
  184. rn = apply(n,1,function(x){
  185. #x = n[1,]
  186. p = wilcox.test(x~g)$p.value
  187. j = ifelse(p<=0.001,
  188. "***",
  189. ifelse(p<=0.01,
  190. "**",
  191. ifelse(p<=0.05,"*","")))
  192. return(j)
  193. })
  194. rownames(n) = paste0(rownames(n),rn)
  195. pi[[i]] =draw_heatmap(n,g,cluster_cols = F,main = corhorts[[i]],show_rownames = T)
  196. }
  197. library(patchwork)
  198. wrap_plots(pi)
  199. 相关性与箱线图
  200. #i = 1
  201. library(ggpubr)
  202. pc = list()
  203. for(i in 1:4){
  204. dat = data.frame(PDCD1 = exps[[i]]["PDCD1",],
  205. CD274 = exps[[i]]["CD274",],
  206. risk_score = survs[[i]]$fp,
  207. Risk = survs[[i]]$Risk)
  208. p1 = ggscatter( dat,
  209. y = "PDCD1", x = "risk_score",
  210. xlab = "Risk score",
  211. size =1,color = "#1d4a9d",
  212. add = "reg.line",
  213. add.params = list(color = "red"))+
  214. stat_cor(label.y = 5)
  215. p2 = ggscatter( dat,
  216. y = "CD274", x = "risk_score",
  217. xlab = "Risk score",
  218. size =1,color = "#1d4a9d",
  219. add = "reg.line",
  220. add.params = list(color = "red"),
  221. title = corhorts[[i]])+
  222. stat_cor(label.y = 5)
  223. p3 = draw_boxplot(t(dat[,1:2]),dat$Risk)
  224. pc[[i]] = list(p1,p2,p3)
  225. }
  226. wrap_plots(c(pc[[1]],pc[[2]],pc[[3]],pc[[4]]),nrow = 2)

image.png
image.png
image.png
image.png
image.png

文献4