模型可视化
生信技能树
1.简版森林图
#https://zhuanlan.zhihu.com/p/109401314
rm(list = ls())
load("../1_data_pre/exp_surv1.Rdata")
rm(surv1)
load("../3_anno_immu/TCGA_idh.Rdata")
library(stringr)
surv1 = idh[,c("event",
                 "time",
                 "fp",
                 "IDH",
                 "age",
                 "gender",
                 "grade",
                 "Meth",
                 "1p19q")]
colnames(surv1)[3] = "riskScore"
library(survival)
library(survminer)
model1 = coxph(Surv(time,event)~.,data = surv1)
ggforest(model1,data = surv1)
coxdat = surv1
table(coxdat$IDH)
## 
## Mutant     WT 
##    435    246
coxdat$IDH = ifelse(coxdat$IDH=="WT",1,2)
table(coxdat$gender)
## 
## female   male 
##    267    364
coxdat$gender = ifelse(coxdat$gender=="female",1,2)
table(coxdat$grade)
## 
##  G2  G3  G4 
## 222 242 167
coxdat$grade = ifelse(coxdat$grade=="G2",2,
                      ifelse(coxdat$grade=="G3",3,4))
table(coxdat$Meth)
## 
##   Methylated Unmethylated 
##          488          165
coxdat$Meth = ifelse(coxdat$Meth=="Unmethylated",1,2)
str(coxdat)
## 'data.frame':    691 obs. of  9 variables:
##  $ event    : int  1 1 0 0 0 1 1 1 1 1 ...
##  $ time     : int  441 74 459 463 485 1427 1427 1009 388 759 ...
##  $ riskScore: num  10.9 11.3 10.1 11 11.7 ...
##  $ IDH      : num  1 1 2 1 1 1 1 2 1 1 ...
##  $ age      : num  78 62 43 53 64 63 63 30 54 49 ...
##  $ gender   : num  2 1 2 2 2 1 1 2 2 2 ...
##  $ grade    : num  4 4 4 4 4 4 4 4 4 4 ...
##  $ Meth     : num  1 1 2 1 1 2 2 2 1 NA ...
##  $ 1p19q    : chr  "non-codel" "non-codel" "non-codel" "non-codel" ...
coxdat$`1p19q` = ifelse(coxdat$`1p19q`=="non-codel",1,2)
model = coxph(Surv(time,event)~.,data = coxdat)
ggforest(model,data = coxdat)
2.美化版森林图
单因素cox
library(tinyarray)
dat1 = surv_cox(t(coxdat[,3:8]),coxdat,continuous = T,pvalue_cutoff = 1)
dat2 = as.data.frame(dat1[,c("HR","HRCILL","HRCIUL","p")])
datp = format(round(dat2[,1:3],3),nsmall = 3)
dat2$Trait = rownames(dat1)
dat2$HR2 = paste0(datp[, 1], "(", datp[, 2], "-", datp[, 3], ")")
str(dat2)
## 'data.frame':    6 obs. of  6 variables:
##  $ HR    : num  2.688 0.115 1.072 1.101 4.776 ...
##  $ HRCILL: num  2.3899 0.0835 1.0602 0.8323 3.7853 ...
##  $ HRCIUL: num  3.023 0.158 1.084 1.456 6.025 ...
##  $ p     : num  0 0 0 0.501 0 ...
##  $ Trait : chr  "riskScore" "IDH" "age" "gender" ...
##  $ HR2   : chr  "2.688(2.390-3.023)" "0.115(0.083-0.158)" "1.072(1.060-1.084)" "1.101(0.832-1.456)" ...
dat2$p = ifelse(dat2$p<0.001,"<0.001",format(round(dat2$p,3),nsmall = 3))
library(forestplot)
forestplot(
  dat2[, c(5, 4, 6)],
  mean = dat2[, 1],
  lower = dat2[, 2],
  upper = dat2[, 3],
  zero = 1,
  boxsize = 0.2,
  col = fpColors(box = '#1075BB', lines = 'black', zero = 'grey'),
  lty.ci = "solid",
  graph.pos = 4
)
多因素cox
m = summary(model)
p = ifelse(
  m$coefficients[, 5] <= 0.001,
  "<0.001",
  ifelse(
    m$coefficients[, 5] <= 0.01,
    "<0.01",
    ifelse(
      m$coefficients[, 5] <= 0.05,
      paste(round(m$coefficients[, 5], 3), " *"),
      round(m$coefficients[, 5], 3)
    )
  )
)
#HR和它的置信区间
dat2 = as.data.frame(round(m$conf.int[, c(1, 3, 4)], 2))
dat2 = tibble::rownames_to_column(dat2, var = "Trait")
colnames(dat2)[2:4] = c("HR", "lower", "upper")
#需要在图上显示的HR文字和p值
dat2$HR2 = paste0(dat2[, 2], "(", dat2[, 3], "-", dat2[, 4], ")")
dat2$p = p
str(dat2)
## 'data.frame':    7 obs. of  6 variables:
##  $ Trait: chr  "riskScore" "IDH" "age" "gender" ...
##  $ HR   : num  2.43 1.22 1.05 1.29 0.97 0.72 0.68
##  $ lower: num  1.96 0.68 1.03 0.93 0.69 0.5 0.37
##  $ upper: num  3.02 2.19 1.06 1.8 1.35 1.05 1.24
##  $ HR2  : chr  "2.43(1.96-3.02)" "1.22(0.68-2.19)" "1.05(1.03-1.06)" "1.29(0.93-1.8)" ...
##  $ p    : chr  "<0.001" "0.507" "<0.001" "0.129" ...
forestplot(
  dat2[, c(1, 5, 6)],
  mean = dat2[, 2],
  lower = dat2[, 3],
  upper = dat2[, 4],
  zero = 1,
  boxsize = 0.4,
  col = fpColors(box = '#1075BB', lines = 'black', zero = 'grey'),
  lty.ci = "solid",
  graph.pos = 4
)
3.诺模图
library(rms)
dd<-datadist(surv1)
options(datadist="dd")
mod <- cph(as.formula(paste("Surv(time, event) ~ ",paste(colnames(surv1)[3:8],collapse = "+"))),
           data=surv1,x=T,y=T,surv = T)
surv<-Survival(mod) 
m1<-function(x) surv(365,x)
m3<-function(x) surv(1095,x)
m5<-function(x) surv(1825,x)
x<-nomogram(mod,
            fun = list(m1,m3,m5),
            funlabel = c('1-y survival',
                         '3-y survival',
                         '5-y survival'),
            lp = F)
plot(x)
奇怪的结果
sfit1 = survival::survfit(survival::Surv(time, event) ~ 
            grade, data = surv1)
survminer::ggsurvplot(sfit1, pval = TRUE, palette = "jco", data = surv1, legend = c(0.8, 0.8))
对此的进一步探索：https://www.yuque.com/xiaojiewanglezenmofenshen/bsgk2d/ci7604
f1 <- cph(formula =  as.formula(paste("Surv(time, event) ~ ",paste(colnames(surv1)[3:8],collapse = "+"))),
           data=surv1,x=T,y=T,surv = T, time.inc=365)
cal1 <- calibrate(f1, cmethod="KM", method="boot", u=365, m=50, B=1000)
## Using Cox survival estimates at  365 Days
f3 <- cph(formula =  as.formula(paste("Surv(time, event) ~ ",paste(colnames(surv1)[3:8],collapse = "+"))),
           data=surv1,x=T,y=T,surv = T, time.inc=1095)
cal3 <- calibrate(f3, cmethod="KM", method="boot", u=1095, m=50, B=1000)
## Using Cox survival estimates at 1095 Days
f5 <- cph(formula =  as.formula(paste("Surv(time, event) ~ ",paste(colnames(surv1)[3:8],collapse = "+"))),
           data=surv1,x=T,y=T,surv = T,  time.inc=1825)
cal5 <- calibrate(f5, cmethod="KM", method="boot", u=1825, m=50, B=1000)
## Using Cox survival estimates at 1825 Days
plot(cal1,lwd = 2,lty = 0,errbar.col = c("#92C5DE"),
     bty = "l", #只画左边和下边框
     xlim = c(0,1),ylim= c(0,1),
     xlab = "Nomogram-prediced OS (%)",ylab = "Observed OS (%)",
     col = c("#92C5DE"),
     cex.lab=1.2,cex.axis=1, cex.main=1.2, cex.sub=0.6)
lines(cal3[,c('mean.predicted',"KM")],
      type = 'b', lwd = 2, col = c("#92C5DE"), pch = 16)
mtext("")
plot(cal3,lwd = 2,lty = 0,errbar.col = c("#F4A582"),
     bty = "l", #只画左边和下边框
     xlim = c(0,1),ylim= c(0,1),
     xlab = "Nomogram-prediced OS (%)",ylab = "Observed OS (%)",
     col = c("#F4A582"),
     cex.lab=1.2,cex.axis=1, cex.main=1.2, cex.sub=0.6)
lines(cal3[,c('mean.predicted',"KM")],
      type = 'b', lwd = 2, col = c("#F4A582"), pch = 16)
mtext("")
plot(cal5,lwd = 2,lty = 0,errbar.col = c("#66C2A5"),
     xlim = c(0,1),ylim= c(0,1),col = c("#66C2A5"),add = T)
lines(cal5[,c('mean.predicted',"KM")],
      type = 'b', lwd = 2, col = c("#66C2A5"), pch = 16)
abline(0,1, lwd = 2, lty = 3, col = c("#224444"))
legend("bottomright", #图例的位置
       legend = c("1-y survival","3-y survival","5-y survival"), #图例文字
       col = c("#92C5DE", "#F4A582", "#66C2A5"), #图例线的颜色，与文字对应
       lwd = 2,#图例中线的粗细
       cex = 1.2)
4.DCA曲线
#devtools::install_github('yikeshu0611/ggDCA')
dat = surv1
dat$nomo = predict(model1,newdata = dat)
library(ggDCA)
library(rms)
nomo  <- cph(Surv(time,event)~nomo,dat)
IDH   <- cph(Surv(time,event)~IDH,dat)
Grade <- cph(Surv(time,event)~grade,dat)
data  <- dca(nomo,IDH,Grade)
ggplot(data)

文献1 肺癌

该文献问题比较多，注意入坑

配对样本的数据，注意配色、图标

突变分析

文献2 黑色素瘤

寻找上述资料
xuna

其他方法计算肿瘤纯度

免疫浸润：Timer 或 cibersort 或 ssgsea

文献3 胶质瘤

图好看，但逻辑有些问题

step1_数据收集
1.数据整理的说明
1.1 下载的数据
四个数据的表达矩阵、生存信息表格。
TCGA的LGG和GBM是分开的。后续用代码合并到一起。
1.2 表达矩阵的要求
(1)就要是取过log、没有异常值的矩阵，标准化过的也可以，因为不做差异分析。
(2)如果是转录组数据，要log之后的标准化数据。（因为这里只做生存分析，不做差异分析，所以没有必要找count数据）
(3)行名都转换成gene symbol
(4)只要tumor样本，不要normal
1.3 临床信息的要求
(1)要有每个样本对应的生存时间和结局事件，并用代码调整顺序，与表达矩阵的样本顺序相同，严格一一对应。
(2)为了后续代码统一，把生存时间和结局事件列名统一起来，都叫“time”和“event”。
(3)至于时间的单位，反正是分别计算的，统不统一无所谓。
1.4 关于数据来源的说明
数据来源于哪个数据库，不是主要矛盾。重点是分清楚是芯片数据还是转录组数据，是芯片的话要能找到探针注释。如果是转录组数据，则需要分清楚是否标准化，进行了哪种标准化。反正是分别分析，又不需要合并，所以直接使用即可。
2.TCGA的RNA-seq数据
Table S1显示样本数量691，TCGA的GBM数据173样本，LGG有529样本。
2.1 TCGA表达矩阵
exp_gbm = read.table("TCGA-GBM.htseq_fpkm.tsv.gz",
                     header = T,row.names = 1,check.names = F)
exp_lgg = read.table("TCGA-LGG.htseq_fpkm.tsv.gz",
                     header = T,row.names = 1,check.names = F)
exp1 = as.matrix(cbind(exp_gbm,exp_lgg))
#行名转换为gene symbol
library(stringr)
library(AnnoProbe)
rownames(exp1) = str_split(rownames(exp1),"\\.",simplify = T)[,1];head(rownames(exp1))
## [1] "ENSG00000242268" "ENSG00000270112" "ENSG00000167578" "ENSG00000273842"
## [5] "ENSG00000078237" "ENSG00000146083"
re = annoGene(rownames(exp1),ID_type = "ENSEMBL");head(re)
##        SYMBOL                           biotypes         ENSEMBL  chr start
## 1     DDX11L1 transcribed_unprocessed_pseudogene ENSG00000223972 chr1 11869
## 2      WASH7P             unprocessed_pseudogene ENSG00000227232 chr1 14404
## 3   MIR6859-1                              miRNA ENSG00000278267 chr1 17369
## 4 MIR1302-2HG                             lncRNA ENSG00000243485 chr1 29554
## 6     FAM138A                             lncRNA ENSG00000237613 chr1 34554
## 7      OR4G4P             unprocessed_pseudogene ENSG00000268020 chr1 52473
##     end
## 1 14409
## 2 29570
## 3 17436
## 4 31109
## 6 36081
## 7 53312
library(tinyarray)
exp1 = trans_array(exp1,ids = re,from = "ENSEMBL",to = "SYMBOL")
exp1[1:4,1:4]
##             TCGA-06-0878-01A TCGA-26-5135-01A TCGA-06-5859-01A TCGA-06-2563-01A
## DDX11L1            0.0000000        0.0221755       0.00000000       0.01797799
## WASH7P             0.6673943        1.7429940       0.65334641       1.57566444
## MIR6859-1          1.2006708        2.1880585       1.51229174       2.17610561
## MIR1302-2HG        0.0000000        0.0000000       0.02520857       0.00000000
dim(exp1)
## [1] 56514   702
#删除正常样本
Group = make_tcga_group(exp1)
table(Group)
## Group
## normal  tumor 
##      5    697
exp1 = exp1[,Group == "tumor"]
#过滤表达量0值太多的基因
exp1 = exp1[apply(exp1, 1, function(x){sum(x>0)>200}),]
2.2 TCGA生存信息
surv_gbm = readr::read_tsv("TCGA-GBM.survival.tsv")
surv_gbm$TYPE = "GBM"
surv_lgg = readr::read_tsv("TCGA-LGG.survival.tsv")
surv_lgg$TYPE = "LGG"
surv1 = rbind(surv_gbm,surv_lgg)
head(surv1)
## # A tibble: 6 x 5
##   sample              OS `_PATIENT`   OS.time TYPE 
##   <chr>            <dbl> <chr>          <dbl> <chr>
## 1 TCGA-12-0657-01A     1 TCGA-12-0657       3 GBM  
## 2 TCGA-32-1977-01A     0 TCGA-32-1977       3 GBM  
## 3 TCGA-19-1791-01A     0 TCGA-19-1791       4 GBM  
## 4 TCGA-28-1757-01A     0 TCGA-28-1757       4 GBM  
## 5 TCGA-19-2624-01A     1 TCGA-19-2624       5 GBM  
## 6 TCGA-41-4097-01A     1 TCGA-41-4097       6 GBM
table(colnames(exp1) %in% surv1$sample)
## 
## FALSE  TRUE 
##     6   691
s = intersect(colnames(exp1),surv1$sample)
exp1 = exp1[,s]
surv1 = surv1[match(s,surv1$sample),]
colnames(surv1)[c(2,4)] = c("event","time")
exp1[1:4,1:4]
##            TCGA-06-0878-01A TCGA-26-5135-01A TCGA-06-5859-01A TCGA-06-2563-01A
## WASH7P           0.66739425       1.74299400       0.65334641       1.57566444
## MIR6859-1        1.20067078       2.18805848       1.51229174       2.17610561
## AL627309.1       0.00000000       0.02066275       0.01386984       0.00000000
## CICP27           0.09691981       0.01013527       0.00000000       0.02449211
head(surv1)
## # A tibble: 6 x 5
##   sample           event `_PATIENT`    time TYPE 
##   <chr>            <dbl> <chr>        <dbl> <chr>
## 1 TCGA-06-0878-01A     0 TCGA-06-0878   218 GBM  
## 2 TCGA-26-5135-01A     1 TCGA-26-5135   270 GBM  
## 3 TCGA-06-5859-01A     0 TCGA-06-5859   139 GBM  
## 4 TCGA-06-2563-01A     0 TCGA-06-2563   932 GBM  
## 5 TCGA-41-2571-01A     1 TCGA-41-2571    26 GBM  
## 6 TCGA-28-5207-01A     1 TCGA-28-5207   343 GBM
identical(colnames(exp1),surv1$sample)
## [1] TRUE
共691个有生存信息的tumor样本。
3.CGGA芯片数据整理
exp2 = read.table("CGGA/CGGA.mRNA_array_301_gene_level.20200506.txt",header = T,row.names = 1)
surv2 = read.table("CGGA/CGGA.mRNA_array_301_clinical.20200506.txt",sep = "\t",header = T,check.names = F)
head(surv2)
##    CGGA_ID TCGA_subtypes PRS_type Histology  Grade Gender Age   OS
## 1  CGGA_11     Classical  Primary       GBM WHO IV Female  57  155
## 2 CGGA_124   Mesenchymal  Primary       GBM WHO IV   Male  53  414
## 3 CGGA_126   Mesenchymal  Primary       GBM WHO IV Female  50 1177
## 4 CGGA_168   Mesenchymal  Primary       GBM WHO IV   Male  17 3086
## 5 CGGA_172   Mesenchymal  Primary       GBM WHO IV Female  57  462
## 6 CGGA_195   Mesenchymal  Primary       GBM WHO IV   Male  48  486
##   Censor (alive=0; dead=1) Radio_status (treated=1;un-treated=0)
## 1                        1                                     1
## 2                        1                                     1
## 3                        1                                     1
## 4                        0                                     1
## 5                        1                                     1
## 6                        1                                     1
##   Chemo_status (TMZ treated=1;un-treated=0) IDH_mutation_status
## 1                                         0            Wildtype
## 2                                         0            Wildtype
## 3                                         1            Wildtype
## 4                                         1            Wildtype
## 5                                         0            Wildtype
## 6                                         1            Wildtype
##   1p19q_Codeletion_status MGMTp_methylation_status
## 1                    <NA>            un-methylated
## 2                    <NA>            un-methylated
## 3                    <NA>            un-methylated
## 4                    <NA>            un-methylated
## 5                    <NA>            un-methylated
## 6                    <NA>            un-methylated
s = intersect(surv2$CGGA_ID,colnames(exp2))
exp2 = as.matrix(exp2[,s])
surv2 = surv2[match(s,surv2$CGGA_ID),]
colnames(surv2)[c(9,8)] = c("event","time")
exp2[1:4,1:4]
##          CGGA_11  CGGA_124  CGGA_126   CGGA_168
## A1BG  -0.3603635 0.5649519 0.3047719  0.1749144
## A1CF   2.3413600 1.1707935 2.2081513  0.9652286
## A2BP1  0.0345194 1.0964074 0.3024883  0.0949111
## A2LD1 -0.9130564 0.5108800 0.4253669 -0.1949377
head(surv2)
##    CGGA_ID TCGA_subtypes PRS_type Histology  Grade Gender Age time event
## 1  CGGA_11     Classical  Primary       GBM WHO IV Female  57  155     1
## 2 CGGA_124   Mesenchymal  Primary       GBM WHO IV   Male  53  414     1
## 3 CGGA_126   Mesenchymal  Primary       GBM WHO IV Female  50 1177     1
## 4 CGGA_168   Mesenchymal  Primary       GBM WHO IV   Male  17 3086     0
## 5 CGGA_172   Mesenchymal  Primary       GBM WHO IV Female  57  462     1
## 6 CGGA_195   Mesenchymal  Primary       GBM WHO IV   Male  48  486     1
##   Radio_status (treated=1;un-treated=0)
## 1                                     1
## 2                                     1
## 3                                     1
## 4                                     1
## 5                                     1
## 6                                     1
##   Chemo_status (TMZ treated=1;un-treated=0) IDH_mutation_status
## 1                                         0            Wildtype
## 2                                         0            Wildtype
## 3                                         1            Wildtype
## 4                                         1            Wildtype
## 5                                         0            Wildtype
## 6                                         1            Wildtype
##   1p19q_Codeletion_status MGMTp_methylation_status
## 1                    <NA>            un-methylated
## 2                    <NA>            un-methylated
## 3                    <NA>            un-methylated
## 4                    <NA>            un-methylated
## 5                    <NA>            un-methylated
## 6                    <NA>            un-methylated
identical(surv2$CGGA_ID,colnames(exp2))
## [1] TRUE
4.CGGA转录组数据整理
exp3 = read.table("CGGA/CGGA.mRNAseq_325.RSEM-genes.20200506.txt",header = T,row.names = 1)
exp3 = as.matrix(log2(exp3+1))
surv3 = read.table("CGGA/CGGA.mRNAseq_325_clinical.20200506.txt",sep = "\t",header = T,check.names = F)
head(surv3)
##     CGGA_ID  PRS_type Histology   Grade Gender Age   OS
## 1 CGGA_1001   Primary       GBM  WHO IV   Male  11 3817
## 2 CGGA_1006   Primary        AA WHO III   Male  42  254
## 3 CGGA_1007   Primary       GBM  WHO IV Female  57  345
## 4 CGGA_1011   Primary       GBM  WHO IV Female  46  109
## 5 CGGA_1015   Primary       GBM  WHO IV   Male  62  164
## 6 CGGA_1019 Recurrent      rGBM  WHO IV   Male  60  212
##   Censor (alive=0; dead=1) Radio_status (treated=1;un-treated=0)
## 1                        0                                     0
## 2                        1                                     1
## 3                        1                                     1
## 4                        1                                     1
## 5                        1                                     1
## 6                        1                                     0
##   Chemo_status (TMZ treated=1;un-treated=0) IDH_mutation_status
## 1                                         1            Wildtype
## 2                                         1            Wildtype
## 3                                         1            Wildtype
## 4                                         0            Wildtype
## 5                                         0            Wildtype
## 6                                         1            Wildtype
##   1p19q_codeletion_status MGMTp_methylation_status
## 1               Non-codel            un-methylated
## 2               Non-codel            un-methylated
## 3               Non-codel            un-methylated
## 4               Non-codel            un-methylated
## 5               Non-codel            un-methylated
## 6               Non-codel               methylated
colnames(surv3)[c(8,7)] = c("event","time")
s = intersect(surv3$CGGA_ID,colnames(exp3))
exp3 = exp3[,s]
surv3 = surv3[match(s,surv3$CGGA_ID),]
exp3[1:4,1:4]
##          CGGA_1001 CGGA_1006 CGGA_1007 CGGA_1011
## A1BG      3.769772 3.0054000  4.958379  2.933573
## A1BG-AS1  1.641546 1.0908534  2.933573  2.411426
## A2M       8.826294 6.7487296  7.698357  9.467952
## A2M-AS1   2.104337 0.1763228  0.704872  1.384050
head(surv3)
##     CGGA_ID  PRS_type Histology   Grade Gender Age time event
## 1 CGGA_1001   Primary       GBM  WHO IV   Male  11 3817     0
## 2 CGGA_1006   Primary        AA WHO III   Male  42  254     1
## 3 CGGA_1007   Primary       GBM  WHO IV Female  57  345     1
## 4 CGGA_1011   Primary       GBM  WHO IV Female  46  109     1
## 5 CGGA_1015   Primary       GBM  WHO IV   Male  62  164     1
## 6 CGGA_1019 Recurrent      rGBM  WHO IV   Male  60  212     1
##   Radio_status (treated=1;un-treated=0)
## 1                                     0
## 2                                     1
## 3                                     1
## 4                                     1
## 5                                     1
## 6                                     0
##   Chemo_status (TMZ treated=1;un-treated=0) IDH_mutation_status
## 1                                         1            Wildtype
## 2                                         1            Wildtype
## 3                                         1            Wildtype
## 4                                         0            Wildtype
## 5                                         0            Wildtype
## 6                                         1            Wildtype
##   1p19q_codeletion_status MGMTp_methylation_status
## 1               Non-codel            un-methylated
## 2               Non-codel            un-methylated
## 3               Non-codel            un-methylated
## 4               Non-codel            un-methylated
## 5               Non-codel            un-methylated
## 6               Non-codel               methylated
identical(surv3$CGGA_ID,colnames(exp3))
## [1] TRUE
我没仔细看这个表达矩阵具体是哪种标准化，反正这个数据只是作为例子，我们就不深究到底是啥了，直接拿来用。注意如果是自己要用来发文章的数据，是要看清楚的，没标准化需要自行标准化，cpm，tmp，fpkm都行。
5.GSE16011的整理
library(tinyarray)
geo = geo_download("GSE16011")
library(stringr)
#只要tumor样本
k = str_detect(geo$pd$title,"glioma");table(k)
## k
## FALSE  TRUE 
##     8   276
geo$exp = geo$exp[,k]
geo$pd = geo$pd[k,]
# 把行名转换为gene symbol
gpl = GEOquery::getGEO(filename = "GPL8542_family.soft.gz",destdir = ".")
ids  = gpl@dataTable@table[,1:2]
library(clusterProfiler)
library(org.Hs.eg.db)
e2s = bitr(ids$ORF,fromType = "ENTREZID",toType = "SYMBOL",OrgDb = "org.Hs.eg.db")
ids = merge(ids,e2s,by.x = "ORF",by.y = "ENTREZID")
ids = ids[,2:3]
colnames(ids) = c("probe_id","symbol")
exp4 = trans_array(geo$exp,ids)
surv4 = geo$pd[,c(1,4,7,9)]
exp4[1:4,1:4]
##      GSM405201 GSM405202 GSM405203 GSM405204
## A1BG  7.219401  6.031261  6.707681  6.330666
## A2M  12.206626 13.260587 12.352693 13.044375
## NAT1  5.182989  7.213272  6.570733  6.343490
## NAT2  5.239842  5.260373  5.697435  4.859682
head(surv4)
##               title age at diagnosis:ch1  histology:ch1 gender:ch1
## GSM405201  glioma 8                44.57 OD (grade III)     Female
## GSM405202  glioma 9                28.69 OD (grade III)       Male
## GSM405203 glioma 11                38.58 OD (grade III)       Male
## GSM405204 glioma 13                33.89 OD (grade III)       Male
## GSM405205 glioma 20                48.03 OD (grade III)       Male
## GSM405206 glioma 21                31.53 OD (grade III)       Male
identical(rownames(surv4),colnames(exp4))
## [1] TRUE
临床信息表格里没有生存信息，所以从文章附件里找
从文章附件里提取临床信息表格
https://aacrjournals.org/cancerres/article/69/23/9065/553005/Intrinsic-Gene-Expression-Profiles-of-Gliomas-Are
包非常难装，需要配置java，但这个需求不常用，直接跳过这个过程。
if(!file.exists("exp_surv4.Rdata")){
  library(tabulizer)
  f <- "00085472can092307-sup-stabs_1-6.pdf"
  re <- extract_tables(f,pages = 1:10) #提取前10页的表格。
  str(re)
  re = do.call(rbind,re)
  re[1:4,1:4]
  colnames(re) = re[1,]
  re = re[-1,]
  re = data.frame(re)
  re[re==""]=NA
  library(readr)
  re$Survival..years. = parse_double(re$Survival..years.,locale = locale(decimal_mark = ","))
  re$Age.at.diagnosis = parse_double(re$Age.at.diagnosis,locale = locale(decimal_mark = ","))
  dim(exp4)
  k = re$Reviewed.histological.diagnosis!="control";table(k)
  re = re[k,]
  re$Database.number = paste("glioma",re$Database.number)
  surv4$ID = rownames(surv4)
  surv4 = merge(surv4,re,by.x = "title",by.y = "Database.number")
  colnames(surv4)[13:14] = c("event","time")
  table(surv4$event)
  surv4$time = as.integer(surv4$time*365) #天为单位
  surv4$event[surv4$event=="Lost to\rfollow up"]=NA
  table(surv4$event)
  surv4$event=ifelse(surv4$event=="Alive",0,1)
  head(surv4)
  s = intersect(surv4$ID,colnames(exp4))
  exp4 = exp4[,s]
  surv4 = surv4[match(s,surv4$ID),]
  save(exp4,surv4,file = "exp_surv4.Rdata")
}
load("exp_surv4.Rdata")
exp4[1:4,1:4]
##      GSM405234 GSM405235 GSM405236 GSM405237
## A1BG  7.613150  6.995666  7.337393  6.287498
## A2M  13.166457 13.003225 13.814942 12.991301
## NAT1  5.581672  7.470894  7.845210  6.088651
## NAT2  5.206464  4.618245  4.881652  5.011908
head(surv4)
##        title age at diagnosis:ch1  histology:ch1 gender:ch1        ID Gender
## 1 glioma 101                57.68 GBM (grade IV)     Female GSM405234 Female
## 2 glioma 104                71.09 GBM (grade IV)       Male GSM405235   Male
## 3 glioma 105                52.20 GBM (grade IV)       Male GSM405236   Male
## 4 glioma 107                51.17 GBM (grade IV)       Male GSM405237   Male
## 5  glioma 11                38.58 OD (grade III)       Male GSM405203   Male
## 6 glioma 111                37.25 GBM (grade IV)       Male GSM405238   Male
##   Reviewed.histological.diagnosis Age.at.diagnosis  KPS     Type.of.surgery
## 1                  GBM (grade IV)            57.68   70   Partial resection
## 2                  GBM (grade IV)            71.09   80   Partial resection
## 3                  GBM (grade IV)            52.20   80  Complete resection
## 4                  GBM (grade IV)            51.17   70 Stereotactic biopsy
## 5                  OD (grade III)            38.58 <NA>  Complete resection
## 6                  GBM (grade IV)            37.25   80 Stereotactic biopsy
##   Radiotherapy Chemotherapy event time
## 1          yes           no     1  226
## 2          yes           no     1   76
## 3          yes           no     1  375
## 4          yes           no     1  149
## 5          yes           no     1 3255
## 6          yes           no     1  343
identical(surv4$ID,colnames(exp4))
## [1] TRUE
检查和保存数据
par(mfrow = c(2,2))
boxplot(exp1[,1:10])
boxplot(exp2[,1:10])
boxplot(exp3[,1:10])
boxplot(exp4[,1:10])
save(exp1,surv1,file = "exp_surv1.Rdata")
save(exp2,surv2,file = "exp_surv2.Rdata")
save(exp3,surv3,file = "exp_surv3.Rdata")
save(exp4,surv4,file = "exp_surv4.Rdata")

surv_cox_lasso
ER相关的基因
endoplasmic reticulum stress
gs = rio::import("GeneCards-SearchResults.csv")
k = gs$`Relevance score`>=7;table(k)
## k
## FALSE  TRUE 
##  6267   802
gs = gs[k,]
两种算法计算每个基因的p值，并取交集
4个数据需要循环，先拿其中一个数据跑通，再把代码写入循环
library(tinyarray)
load("../1_data_pre/exp_surv1.Rdata")
exp1 = exp1[rownames(exp1) %in% gs$`Gene Symbol`,]
c1 = surv_cox(exp1,surv1,continuous = T)
head(c1)
##             coef         se         z            p       HR      HRse      HRz
## UBE2J2 1.4404365 0.13446765 10.712141 0.000000e+00 4.222539 0.5677949 5.675533
## RER1   1.8780151 0.11813991 15.896534 0.000000e+00 6.540509 0.7726952 7.170369
## ICMT   1.0621919 0.11658641  9.110770 0.000000e+00 2.892705 0.3372500 5.612170
## PARK7  0.6545554 0.13911446  4.705157 2.536705e-06 1.924287 0.2676961 3.452746
## H6PD   0.8217466 0.10642461  7.721396 1.154632e-14 2.274469 0.2420595 5.265107
## FBXO6  0.9195681 0.08969749 10.251882 0.000000e+00 2.508207 0.2249799 6.703741
##                 HRp   HRCILL   HRCIUL
## UBE2J2 1.382573e-08 3.244252 5.495823
## RER1   7.479573e-13 5.188606 8.244654
## ICMT   1.998047e-08 2.301789 3.635320
## PARK7  5.549107e-04 1.465060 2.527460
## H6PD   1.401079e-07 1.846253 2.802004
## FBXO6  2.031497e-11 2.103840 2.990295
nrow(c1)
## [1] 653
k1 = surv_KM(exp1,surv1)
head(k1) #p值实在太小，逼近0
##  UBE2J2    RER1   FBXO6   DDOST   CDC42 SELENON 
##       0       0       0       0       0       0
length(k1)
## [1] 642
g1 = intersect(rownames(c1),names(k1))
head(g1)
## [1] "UBE2J2" "RER1"   "ICMT"   "PARK7"  "H6PD"   "FBXO6"
length(g1)
## [1] 615
循环
load("../1_data_pre/exp_surv2.Rdata")
load("../1_data_pre/exp_surv3.Rdata")
load("../1_data_pre/exp_surv4.Rdata")
exp = list(exp1,exp2,exp3,exp4)
surv = list(surv1,surv2,surv3,surv4)
g = list()
for(i in 1:4){
  exp[[i]] = exp[[i]][rownames(exp[[i]]) %in% gs$`Gene Symbol`,]
  c1 = surv_cox(exp[[i]],surv[[i]],continuous = T)
  k = surv_KM(exp[[i]],surv[[i]])
  g[[i]] = intersect(rownames(c1),names(k))
}
names(g) = c("TCGA","CGGA_array","CGGA","GSE16011")
sapply(g, length)
##       TCGA CGGA_array       CGGA   GSE16011 
##        615        413        551        335
v_plot = draw_venn(g,"")
ggplot2::ggsave(v_plot,filename = "venn.png")
 4个数据,两种算法p<0.05的基因取交集，得到195个基因，用于lasso回归
g = intersect_all(g)
length(g)
## [1] 195
lasso回归
使用TCGA的数据构建模型
library(survival)
x = t(exp1[g,])
y = data.matrix(Surv(surv1$time,surv1$event)) 
library(glmnet)
set.seed(10210)
cvfit = cv.glmnet(x, y, family="cox") 
fit=glmnet(x, y, family = "cox") 
coef = coef(fit, s = cvfit$lambda.min) 
index = which(coef != 0) 
actCoef = coef[index] 
lassoGene = row.names(coef)[index] 
lassoGene
##  [1] "GPX7"     "MR1"      "SHISA5"   "CP"       "PPM1L"    "DNAJB11" 
##  [7] "SNCA"     "ANXA5"    "HFE"      "EPM2A"    "FKBP14"   "BET1"    
## [13] "SERPINE1" "CASP2"    "BAG1"     "SIRT1"    "DNAJB12"  "ERLIN1"  
## [19] "CYP2E1"   "CASP4"    "SLN"      "ERP27"    "DDIT3"    "ATP2A2"  
## [25] "ERP29"    "MYH7"     "CDKN3"    "MAPK3"    "MMP2"     "NOL3"    
## [31] "BRCA1"    "PRNP"     "MX1"      "RNF185"   "SREBF2"   "GLA"
par(mfrow = c(1,2))
plot(cvfit) 
plot(fit,xvar="lambda",label = F)
逐步回归法
library(My.stepwise)
vl <- lassoGene
dat = cbind(surv1,t(exp1[lassoGene,]))
# My.stepwise.coxph(Time = "time",
#                   Status = "event",
#                   variable.list = vl,
#                   data = dat)
model = coxph(formula = Surv(time, event) ~ HFE + SHISA5 + BRCA1 + EPM2A + 
    ERLIN1 + GPX7 + SLN + DNAJB11 + MMP2 + NOL3 + CP + ATP2A2 + 
    GLA + MAPK3 + SREBF2 + CASP2 + SNCA + DDIT3 + BAG1 + ANXA5, 
    data = dat)
summary(model)$concordance
##           C       se(C) 
## 0.865988496 0.009754695
genes = names(model$coefficients);length(genes)
## [1] 20
library(survminer)
ggforest(model,data = dat)
ggsave("forest.png",width = 10,height = 8)
计算风险评分
dats  = list(dat1 = cbind(surv1,t(exp1[genes,])),
             dat2 = cbind(surv2,t(exp2[genes,])),
             dat3 = cbind(surv3,t(exp3[genes,])),
             dat4 = cbind(surv4,t(exp4[genes,])))
library(dplyr)
survs = lapply(dats, function(x){
  fp = apply(x[,genes], 1,function(k)sum(model$coefficients * k)) #lasso回归模型的预测值就是线性加乘
  # x = x[,!(colnames(x)%in% genes)]
  x$fp = fp
  x$Risk = ifelse(x$fp<median(x$fp),"low","high")
  x$Risk = factor(x$Risk,levels = c("low","high"))
  return(x)
})
head(survs[[1]])
##                            sample event     _PATIENT time TYPE       HFE
## TCGA-06-0878-01A TCGA-06-0878-01A     0 TCGA-06-0878  218  GBM 2.3744248
## TCGA-26-5135-01A TCGA-26-5135-01A     1 TCGA-26-5135  270  GBM 0.7702822
## TCGA-06-5859-01A TCGA-06-5859-01A     0 TCGA-06-5859  139  GBM 2.5060929
## TCGA-06-2563-01A TCGA-06-2563-01A     0 TCGA-06-2563  932  GBM 1.6765673
## TCGA-41-2571-01A TCGA-41-2571-01A     1 TCGA-41-2571   26  GBM 0.7707545
## TCGA-28-5207-01A TCGA-28-5207-01A     1 TCGA-28-5207  343  GBM 1.7947694
##                    SHISA5    BRCA1    EPM2A   ERLIN1     GPX7       SLN
## TCGA-06-0878-01A 4.891004 1.929324 1.824520 2.963291 4.161208 3.2520898
## TCGA-26-5135-01A 6.280796 1.535608 1.513929 2.027252 3.812280 3.3372085
## TCGA-06-5859-01A 4.708155 3.118322 1.696385 2.702454 4.032823 3.6417199
## TCGA-06-2563-01A 5.343170 1.324959 2.174987 2.690188 4.270150 0.9712369
## TCGA-41-2571-01A 5.184783 1.082552 1.091940 1.892868 3.539653 2.8842578
## TCGA-28-5207-01A 5.152544 1.958569 2.323285 2.802503 3.302680 0.9862039
##                   DNAJB11     MMP2     NOL3       CP   ATP2A2      GLA    MAPK3
## TCGA-06-0878-01A 4.489935 5.196985 3.954610 4.729828 4.931262 3.699737 4.639285
## TCGA-26-5135-01A 3.934268 5.267932 3.521318 1.514229 5.054068 3.385206 5.064507
## TCGA-06-5859-01A 3.360617 3.364497 2.896874 3.009394 4.516211 4.166703 4.214156
## TCGA-06-2563-01A 3.828734 6.471976 1.931305 1.262133 4.879086 3.787763 4.527579
## TCGA-41-2571-01A 3.083744 8.445514 3.889378 1.410165 4.753264 3.945768 4.887269
## TCGA-28-5207-01A 3.558798 4.552452 2.013993 1.442487 4.530683 3.779798 4.950130
##                    SREBF2    CASP2     SNCA    DDIT3     BAG1    ANXA5       fp
## TCGA-06-0878-01A 3.646879 3.056804 1.750464 5.599779 2.981576 7.777013 12.03249
## TCGA-26-5135-01A 4.656178 2.837734 2.655945 5.330168 3.299911 7.067037 12.53646
## TCGA-06-5859-01A 4.619645 2.662097 1.341060 5.621570 3.197839 6.306859 11.79212
## TCGA-06-2563-01A 4.689714 3.175957 2.339013 4.388975 3.533318 7.995120 10.95347
## TCGA-41-2571-01A 4.736130 3.262038 2.155570 5.840359 3.408114 6.413893 12.78245
## TCGA-28-5207-01A 4.002934 3.147226 3.431305 7.208496 3.488568 7.653079 11.76648
##                  Risk
## TCGA-06-0878-01A high
## TCGA-26-5135-01A high
## TCGA-06-5859-01A high
## TCGA-06-2563-01A high
## TCGA-41-2571-01A high
## TCGA-28-5207-01A high
save(model,genes,survs,file = "model_genes_survs.Rdata")
survs里的每个数据都是由临床信息、建模基因的表达量、fp(预测值)、Risk(风险分组)构成，整理成了相同格式。
timeROC曲线
library(timeROC)
result = list()
p = list()
for(i in 1:4){
  result[[i]] <-with(survs[[i]], timeROC(T=time,
                     delta=event,
                     marker=fp,
                     cause=1,
                     times=c(365,1095,1825),
                     iid = TRUE))
  dat = data.frame(fpr = as.numeric(result[[i]]$FP),
                   tpr = as.numeric(result[[i]]$TP),
                   time = rep(as.factor(c(365,1095,1825)),each = nrow(result[[i]]$TP)))
  library(ggplot2)
  p[[i]] = ggplot() + 
    geom_line(data = dat,aes(x = fpr, y = tpr,color = time),size = 1) + 
    scale_color_manual(name = NULL,values = c("#92C5DE", "#F4A582", "#66C2A5"),
                       labels = paste0("AUC of ",c(1,3,5),"-y survival: ",
                                       format(round(result[[i]]$AUC,2),nsmall = 2)))+
    geom_line(aes(x=c(0,1),y=c(0,1)),color = "grey")+
    theme_bw()+
    theme(panel.grid = element_blank(),
          legend.background = element_rect(linetype = 1, size = 0.2, colour = "black"),
          legend.position = c(0.765,0.125))+
    scale_x_continuous(expand = c(0.005,0.005))+
    scale_y_continuous(expand = c(0.005,0.005))+
    labs(x = "1 - Specificity",
         y = "Sensitivity")+
    coord_fixed()
  }
library(patchwork)
wrap_plots(p)
ggsave("time_ROC.png",height = 10,width = 10)
高低风险KM_plot
library(survival)
library(survminer)
corhorts = c("TCGA","CGGA_array","CGGA","GSE16011")
splots = list()
for(i in 1:4){
  x = survs[[i]]
  sfit1 = survfit(Surv(time, event) ~ Risk, data = x)
  splots[[i]] =  ggsurvplot(sfit1, pval = TRUE, palette = "jco", 
            data = x, legend = c(0.8, 0.8), title =corhorts[[i]],risk.table = T)
}
png("survs.png",height = 800,width = 800)
arrange_ggsurvplots(splots,nrow = 2)
dev.off()
## png 
##   2
 ### 三图联动
有缺失time和event的样本前面没有去掉，画图会出NA，但是动了前面结果就会变化很大，猜测作者是只在画图时去掉了，不参考他的做法，你自己做时整理临床信息时就去掉，这里就不会出NA了。
source("risk_plot.R")
risk_plot(survs[[1]],genes)
risk_plot(survs[[2]],genes)
risk_plot(survs[[3]],genes)
risk_plot(survs[[4]],genes)

for循环

逐步回归法

三图联动
三张图的顺序都一样

免疫亚型

 免疫相关的分析
生信技能树
1.抑制cancer‐immunity cycle 的基因
http://biocc.hrbmu.edu.cn/TIP/index.jsp
rm(list = ls())
library(tinyarray)
tip = read.delim("TIP_genes.txt",header = T)
nrow(tip)
## [1] 273
head(tip)
##   GeneSymbol Steps Direction ImmuneCellType
## 1       IL10     1  positive       Multiple
## 2      TGFB1     1  positive       Multiple
## 3      HMGB1     1  positive       Multiple
## 4      ANXA1     1  positive       Multiple
## 5       CALR     1  positive       Multiple
## 6     CXCL10     1  positive       Multiple
gs = tip$GeneSymbol[tip$Direction =="negative"];length(gs)
## [1] 54
load("../1_data_pre/exp_surv1.Rdata")
load("../1_data_pre/exp_surv2.Rdata")
load("../1_data_pre/exp_surv3.Rdata")
load("../1_data_pre/exp_surv4.Rdata")
load("../2_model/model_genes_survs.Rdata")
exps = list(exp1,exp2,exp3,exp4)
#i = 1
p1 = list()
corhorts = c("TCGA","CGGA_array","CGGA","GSE16011")
for(i in 1:4){
  n = exps[[i]][rownames(exps[[i]])%in% gs,]
  n = n[,order(survs[[i]]$fp)]
  g = sort(survs[[i]]$Risk)
  rn = apply(n,1,function(x){
    #x = n[1,]
    p = wilcox.test(x~g)$p.value
    j = ifelse(p<=0.001,
           "***",
           ifelse(p<=0.01,
                  "**",
                  ifelse(p<=0.05,"*","")))
    return(j)
  })
  rownames(n) = paste0(rownames(n),rn)
  p1[[i]] =draw_heatmap(n,g,cluster_cols = F,main = corhorts[[i]],show_rownames = T)
}
library(patchwork)
wrap_plots(p1)
箱线图
g4 = c("VEGFA","TGFB1","CD95L","CD70")
p2 = list()
library(ggplot2)
for(i in 1:4){
  p2[[i]] = draw_boxplot(exps[[i]][rownames(exps[[i]])%in% g4,],survs[[i]]$Risk,sort = F)+
    ggtitle(corhorts[[i]])+
    theme(plot.title = element_text(hjust = 0.5)) 
}
library(ggplot2)
wrap_plots(p2) + plot_layout(guides = "collect") &
  theme(legend.position='right')
没有找到CD95L基因。这里仅作例子，自己文章中用到的重要基因，可以查查别名。
2.免疫细胞丰度
用ssgsea方法。mmc3.xlsx出自这篇文章的TableS6:https://www.sciencedirect.com/science/article/pii/S2211124716317090
geneset = rio::import("mmc3.xlsx",skip = 2)
geneset = split(geneset$Metagene,geneset$`Cell type`)
lapply(geneset[1:3], head)
## $`Activated B cell`
## [1] "ADAM28" "CD180"  "CD79B"  "BLK"    "CD19"   "MS4A1" 
## 
## $`Activated CD4 T cell`
## [1] "AIM2"  "BIRC3" "BRIP1" "CCL20" "CCL4"  "CCL5" 
## 
## $`Activated CD8 T cell`
## [1] "ADRM1"     "AHSA1"     "C1GALT1C1" "CCT6B"     "CD37"      "CD3D"
library(GSVA)
f = "ssgsea_result.Rdata"
if(!file.exists(f)){
  p = list()
for(i in 1:4){
  #i = 1
  re <- gsva(exps[[i]], geneset, method="ssgsea",
               mx.diff=FALSE, verbose=FALSE
)
  p[[i]] = draw_boxplot(re,survs[[i]]$Risk)+
    ggtitle(corhorts[[i]])+
    theme(plot.title = element_text(hjust = 0.5))+ 
    theme(plot.margin=unit(c(1,3,1,1),'lines')) #调整边距
}
  save(re,p,file = f)
}
load(f)
wrap_plots(p)+ plot_layout(guides = "collect") &
  theme(legend.position='right')
免疫亚型
#devtools::install_github("Gibbsdavidl/ImmuneSubtypeClassifier")
library(ImmuneSubtypeClassifier)
f = "immu_res.Rdata"
if(!file.exists(f)){
  res = list()
for(i in 1:4){
  a <- callEnsemble(X = exps[[i]], geneids = 'symbol')[,1:2]
  a$corhort = corhorts[[i]]
  a$Risk = survs[[i]]$Risk
  res[[i]] = a
}
  save(res,file = f)
}
load(f)
我检查了一下这个方法和xena整理结果是否一致
a1 = res[[1]]
head(a1)
##          SampleIDs BestCall corhort Risk
## 1 TCGA-06-0878-01A        4    TCGA high
## 2 TCGA-26-5135-01A        4    TCGA high
## 3 TCGA-06-5859-01A        4    TCGA high
## 4 TCGA-06-2563-01A        4    TCGA high
## 5 TCGA-41-2571-01A        4    TCGA high
## 6 TCGA-28-5207-01A        4    TCGA high
a1$SampleIDs = str_sub(a1$SampleIDs,1,15)
a2 = read.delim("Subtype_Immune_Model_Based.txt.gz")
head(a2)
##            sample        Subtype_Immune_Model_Based
## 1 TCGA-A5-A0GI-01         Wound Healing (Immune C1)
## 2 TCGA-S9-A7J2-01 Immunologically Quiet (Immune C5)
## 3 TCGA-EK-A2RE-01    IFN-gamma Dominant (Immune C2)
## 4 TCGA-D5-5538-01    IFN-gamma Dominant (Immune C2)
## 5 TCGA-F4-6854-01         Wound Healing (Immune C1)
## 6 TCGA-AX-A2H7-01         Wound Healing (Immune C1)
table(a1$SampleIDs %in% a2$sample)
## 
## FALSE  TRUE 
##    34   657
a = merge(a1,a2,by.x = "SampleIDs",by.y = "sample")
a$check = str_extract(a$Subtype_Immune_Model_Based,"\\d")
table(a$check==a$BestCall)
## 
## FALSE  TRUE 
##    59   598
大部分一致吧。但还是有一些出入的
res2 = do.call(rbind,res)
head(res2)
##          SampleIDs BestCall corhort Risk
## 1 TCGA-06-0878-01A        4    TCGA high
## 2 TCGA-26-5135-01A        4    TCGA high
## 3 TCGA-06-5859-01A        4    TCGA high
## 4 TCGA-06-2563-01A        4    TCGA high
## 5 TCGA-41-2571-01A        4    TCGA high
## 6 TCGA-28-5207-01A        4    TCGA high
res2$BestCall = factor(paste0("C",res2$BestCall))
library(dplyr)
dat2 <- res2 %>%
 group_by(corhort,Risk, BestCall) %>% 
 summarise(count = n())
library(paletteer)
pb = ggplot() + geom_bar(data =dat2, aes(x = Risk, y = count, fill = BestCall),
 stat = "identity",
 position = "fill")+
  theme_classic()+
  scale_fill_paletteer_d("RColorBrewer::Set2")+
  labs(x = "Risk",y = "Percentage")+
  facet_wrap(~corhort,nrow = 1)
pb
免疫检查点基因
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7786136/
# 别名 B7-H3 是CD276；PD-L1是CD274;PD-1是PDCD1，PDCD1LG2是PD-L2
tmp = readLines("immune_checkpoint.txt")
genes = str_split(tmp,", ")[[1]]
sapply(exps, function(x){genes[!(genes %in% rownames(x))]})
## [[1]]
## character(0)
## 
## [[2]]
## character(0)
## 
## [[3]]
## [1] "BTNL2"   "IDO2"    "KIR3DL1"
## 
## [[4]]
## [1] "IDO2"
pi = list()
for(i in 1:4){
  n = exps[[i]][rownames(exps[[i]])%in% genes,]
  n = n[,order(survs[[i]]$fp)]
  g = sort(survs[[i]]$Risk)
  rn = apply(n,1,function(x){
    #x = n[1,]
    p = wilcox.test(x~g)$p.value
    j = ifelse(p<=0.001,
           "***",
           ifelse(p<=0.01,
                  "**",
                  ifelse(p<=0.05,"*","")))
    return(j)
  })
  rownames(n) = paste0(rownames(n),rn)
  pi[[i]] =draw_heatmap(n,g,cluster_cols = F,main = corhorts[[i]],show_rownames = T)
}
library(patchwork)
wrap_plots(pi)
相关性与箱线图
#i = 1
library(ggpubr)
pc = list()
for(i in 1:4){
  dat = data.frame(PDCD1 = exps[[i]]["PDCD1",],
                  CD274 = exps[[i]]["CD274",],
                  risk_score = survs[[i]]$fp,
                  Risk = survs[[i]]$Risk)
  p1 = ggscatter( dat, 
             y = "PDCD1", x = "risk_score",
             xlab = "Risk score",
             size =1,color = "#1d4a9d",
             add = "reg.line", 
            add.params = list(color = "red"))+
    stat_cor(label.y = 5)
  p2 = ggscatter( dat, 
             y = "CD274", x = "risk_score",
             xlab = "Risk score",
             size =1,color = "#1d4a9d",
             add = "reg.line", 
            add.params = list(color = "red"),
            title = corhorts[[i]])+
    stat_cor(label.y = 5)
  p3 = draw_boxplot(t(dat[,1:2]),dat$Risk)
  pc[[i]] = list(p1,p2,p3)
}
wrap_plots(c(pc[[1]],pc[[2]],pc[[3]],pc[[4]]),nrow = 2)

生信技能树线上直播课（学习记录ing）

文献复现

文献1 肺癌

文献2 黑色素瘤

文献3 胶质瘤

文献4