image.png
image.png
image.png

image.png
event需要是0、1
image.png

生存分析前的数据整理

  1. 表达矩阵只需要tumor数据,不要normal,将其去掉,新表达矩阵数据命名为exprSet
  2. 临床信息需要进一步整理,成为生存分析需要的格式,新临床信息数据命名为meta
  3. rm(list=ls())
  4. proj = "TCGA-KIRC"
  5. load(paste0(proj,".Rdata"))
  6. library(stringr)
  7. 1.整理表达矩阵
  8. 1.1去除normal样本
  9. table(Group)
  10. #> Group
  11. #> normal tumor
  12. #> 72 535
  13. exprSet = exp[,Group=='tumor']
  14. ncol(exp)
  15. #> [1] 607
  16. ncol(exprSet)
  17. #> [1] 535
  18. 1.2 基因过滤
  19. 再次进行基因过滤.
  20. (1)标准1:至少要在50%的样本里表达量大于0(最低标准)。
  21. exp(600样本)满足“至少在300个样本里表达量>0”,不能等同于 exprSet(500样本)满足“至少在250个样本里表达量>0
  22. k = apply(exprSet,1, function(x){sum(x>0)>0.5*ncol(exprSet)});table(k)
  23. #> k
  24. #> FALSE TRUE
  25. #> 155 31798
  26. exprSet = exprSet[k,]
  27. nrow(exprSet)
  28. #> [1] 31798
  29. (2)标准2:至少在一半以上样本里表达量>10(其他数字也可,酌情调整)
  30. k = apply(exprSet,1, function(x){sum(x>10)>0.5*ncol(exprSet)});table(k)
  31. #> k
  32. #> FALSE TRUE
  33. #> 11718 20080
  34. exprSet = exprSet[k,]
  35. nrow(exprSet)
  36. #> [1] 20080
  37. 1.3 使用logCPMlogTPM数据
  38. exprSet[1:4,1:4]
  39. #> TCGA-BP-4342-01A TCGA-A3-3387-01A TCGA-BP-4167-01A TCGA-B8-4620-01A
  40. #> WASH7P 77 69 23 29
  41. #> AL627309.6 282 36 26 88
  42. #> WASH9P 222 147 74 47
  43. #> AP006222.1 77 23 23 14
  44. exprSet=log2(edgeR::cpm(exprSet)+1)
  45. exprSet[1:4,1:4]
  46. #> TCGA-BP-4342-01A TCGA-A3-3387-01A TCGA-BP-4167-01A TCGA-B8-4620-01A
  47. #> WASH7P 1.049412 1.0031558 0.5304210 0.5510009
  48. #> AL627309.6 2.297940 0.6078858 0.5871819 1.2698370
  49. #> WASH9P 2.029993 1.6506568 1.2807483 0.8104724
  50. #> AP006222.1 1.049412 0.4166163 0.5304210 0.2922303
  51. 1.4 样本与病人
  52. 有的病人会有两个或两个以上的肿瘤样本,就有重复。两种可行的办法:
  53. 1)以病人为中心,对表达矩阵的列按照病人ID去重复,每个病人只保留一个样本。
  54. exprSet = exprSet[,sort(colnames(exprSet))]
  55. k = !duplicated(str_sub(colnames(exprSet),1,12));table(k)
  56. #> k
  57. #> FALSE TRUE
  58. #> 5 530
  59. exprSet = exprSet[,k]
  60. ncol(exprSet)
  61. #> [1] 530
  62. 2)以样本为中心,如果每个病人有多个样本则全部保留。(删掉上面这一段代码即可)
  63. 2.整理生存信息和临床信息
  64. 2.1数据合并
  65. xena单独整理了生存信息,和临床信息是分开存放的。后续构建模型可能会需要纳入一些临床信息,所以要合并到一起。
  66. library(dplyr)
  67. nrow(surv)
  68. #> [1] 979
  69. nrow(clinical)
  70. #> [1] 985
  71. meta = left_join(surv,clinical,by = c("sample"= "submitter_id.samples"))
  72. nrow(surv)
  73. #> [1] 979
  74. 2.2 样本过滤
  75. 去掉生存信息不全或者生存时间小于30天的样本,样本纳排标准不唯一,且差别很大
  76. k1 = meta$OS.time >= 30;table(k1)
  77. #> k1
  78. #> FALSE TRUE
  79. #> 20 959
  80. k2 = !(is.na(meta$OS.time)|is.na(meta$OS));table(k2)
  81. #> k2
  82. #> TRUE
  83. #> 979
  84. meta = meta[k1&k2,]
  85. 2.3 选列、简化列名
  86. 小技巧:搜索列名
  87. tmp = data.frame(colnames(meta))
  88. #View(tmp)
  89. 选择需要的列,简化列名
  90. meta = meta[,c(
  91. 'sample',
  92. 'OS',
  93. 'OS.time',
  94. 'race.demographic',
  95. 'age_at_initial_pathologic_diagnosis',
  96. 'gender.demographic' ,
  97. 'tumor_stage.diagnoses'
  98. )]
  99. colnames(meta)=c('ID','event','time','race','age','gender','stage')
  100. str(meta)
  101. #> 'data.frame': 959 obs. of 7 variables:
  102. #> $ ID : chr "TCGA-B8-5162-01A" "TCGA-B4-5834-01A" "TCGA-CW-6087-01A" "TCGA-CW-6087-11A" ...
  103. #> $ event : int 0 0 1 1 1 1 1 1 0 1 ...
  104. #> $ time : int 36 38 41 41 42 42 43 43 43 51 ...
  105. #> $ race : chr "white" "white" "white" "white" ...
  106. #> $ age : int 62 59 61 61 75 75 65 65 35 72 ...
  107. #> $ gender: chr "male" "male" "male" "male" ...
  108. #> $ stage : chr "stage ii" "stage i" "stage iv" "stage iv" ...
  109. 2.4 简化、规范内容
  110. 结局事件
  111. 生存分析的输入数据里,要求结局事件必须用01表示,1表示阳性结局。
  112. xena的数据是整理好的,其他来源的需要自行检查和整理。
  113. table(meta$event)
  114. #>
  115. #> 0 1
  116. #> 622 337
  117. #自行整理的例子:
  118. a = c("Dead", "Dead", "Alive", "Dead", "Alive")
  119. ifelse(a=="Alive",0,1)
  120. #> [1] 1 1 0 1 0
  121. 生存时间
  122. 认清生存时间的单位(通常是月,也可以用天和年);
  123. range(meta$time)
  124. #> [1] 36 4537
  125. meta$time = meta$time/30
  126. range(meta$time)
  127. #> [1] 1.2000 151.2333
  128. (3)检查各列的信息是否规范,没有冗余信息。缺失的信息用NA代替
  129. table(meta$stage)
  130. #>
  131. #> not reported stage i stage ii stage iii stage iv
  132. #> 4 467 101 226 161
  133. meta$stage = meta$stage %>%
  134. str_replace("not reported",as.character(NA)) %>%
  135. str_remove("stage ") %>%
  136. str_to_upper()
  137. table(meta$stage,useNA = "always")
  138. #>
  139. #> I II III IV <NA>
  140. #> 467 101 226 161 4
  141. table(meta$race)
  142. #>
  143. #> asian black or african american not reported
  144. #> 16 69 14
  145. #> white
  146. #> 860
  147. meta$race = meta$race %>%
  148. str_replace("not reported",as.character(NA))
  149. table(meta$race,useNA = "always")
  150. #>
  151. #> asian black or african american white
  152. #> 16 69 860
  153. #> <NA>
  154. #> 14
  155. table(meta$gender,useNA = "always")
  156. #>
  157. #> female male <NA>
  158. #> 327 632 0
  159. str(meta)
  160. #> 'data.frame': 959 obs. of 7 variables:
  161. #> $ ID : chr "TCGA-B8-5162-01A" "TCGA-B4-5834-01A" "TCGA-CW-6087-01A" "TCGA-CW-6087-11A" ...
  162. #> $ event : int 0 0 1 1 1 1 1 1 0 1 ...
  163. #> $ time : num 1.2 1.27 1.37 1.37 1.4 ...
  164. #> $ race : chr "white" "white" "white" "white" ...
  165. #> $ age : int 62 59 61 61 75 75 65 65 35 72 ...
  166. #> $ gender: chr "male" "male" "male" "male" ...
  167. #> $ stage : chr "II" "I" "IV" "IV" ...
  168. 3.实现表达矩阵与临床信息的匹配
  169. 即:meta的每一行与exprSet每一列一一对应
  170. rownames(meta) <- meta$ID
  171. s = intersect(rownames(meta),colnames(exprSet));length(s)
  172. #> [1] 513
  173. exprSet = exprSet[,s]
  174. meta = meta[s,]
  175. dim(exprSet)
  176. #> [1] 20080 513
  177. dim(meta)
  178. #> [1] 513 7
  179. identical(rownames(meta),colnames(exprSet))
  180. #> [1] TRUE
  181. save(meta,exprSet,proj,file = paste0(proj,"_sur_model.Rdata")) #将_sur_model.Rdata弄好

实操

  1. 1.准备输入数据
  2. rm(list = ls())
  3. proj = "TCGA-KIRC"
  4. load(paste0(proj,"_sur_model.Rdata"))
  5. ls()
  6. #> [1] "exprSet" "meta" "proj"
  7. exprSet[1:4,1:4]
  8. #> TCGA-B8-5162-01A TCGA-B4-5834-01A TCGA-CW-6087-01A TCGA-B0-4698-01A
  9. #> WASH7P 0.38330266 1.0823971 0.24141414 0.8200756
  10. #> AL627309.6 0.04121953 0.1330091 0.60612548 1.7747381
  11. #> WASH9P 0.95751073 1.8118992 0.66255436 1.7390920
  12. #> AP006222.1 0.15825052 0.2713707 0.08503973 0.5589702
  13. str(meta)
  14. #> 'data.frame': 513 obs. of 7 variables:
  15. #> $ ID : chr "TCGA-B8-5162-01A" "TCGA-B4-5834-01A" "TCGA-CW-6087-01A" "TCGA-B0-4698-01A" ...
  16. #> $ event : int 0 0 1 1 1 0 1 0 1 0 ...
  17. #> $ time : num 1.2 1.27 1.37 1.4 1.43 ...
  18. #> $ race : chr "white" "white" "white" "white" ...
  19. #> $ age : int 62 59 61 75 65 35 72 50 84 76 ...
  20. #> $ gender: chr "male" "male" "male" "male" ...
  21. #> $ stage : chr "II" "I" "IV" "IV" ...
  22. 2.KM-plot
  23. 简单版本和进阶版本
  24. library(survival)
  25. library(survminer)
  26. sfit <- survfit(Surv(time, event)~gender, data=meta)
  27. ggsurvplot(sfit,pval=TRUE)
  28. ggsurvplot(sfit,
  29. palette = "jco",
  30. risk.table =TRUE,
  31. pval =TRUE,
  32. conf.int =TRUE)
  33. 连续型信息怎么作KM分析?例如年龄,基因?
  34. 连续型数据的离散化
  35. 年龄
  36. group = ifelse(meta$age>median(meta$age,na.rm = T),"older","younger")
  37. table(group)
  38. #> group
  39. #> older younger
  40. #> 251 262
  41. sfit=survfit(Surv(time, event)~group, data=meta)
  42. ggsurvplot(sfit,pval =TRUE, data = meta, risk.table = TRUE)
  43. 基因
  44. g = rownames(exprSet)[1];g
  45. #> [1] "WASH7P"
  46. meta$gene = ifelse(exprSet[g,]> median(exprSet[g,]),'high','low')
  47. sfit=survfit(Surv(time, event)~gene, data=meta)
  48. ggsurvplot(sfit,pval =TRUE, data = meta, risk.table = TRUE)

image.png

image.png
image.png

  1. 2.log-rank test
  2. KMp值是log-rank test得出的,可批量操作
  3. source("KM_cox_function.R")
  4. logrankfile = paste0(proj,"_log_rank_p.Rdata")
  5. if(!file.exists(logrankfile)){
  6. log_rank_p <- apply(exprSet , 1 , geneKM)
  7. log_rank_p=sort(log_rank_p)
  8. save(log_rank_p,file = logrankfile)
  9. }
  10. load(logrankfile)
  11. table(log_rank_p<0.01)
  12. #>
  13. #> FALSE TRUE
  14. #> 13023 7057
  15. table(log_rank_p<0.05)
  16. #>
  17. #> FALSE TRUE
  18. #> 10232 9848
  19. 3.批量单因素cox
  20. coxfile = paste0(proj,"_cox.Rdata")
  21. if(!file.exists(coxfile)){
  22. cox_results <-apply(exprSet , 1 , genecox)
  23. cox_results=as.data.frame(t(cox_results))
  24. save(cox_results,file = coxfile)
  25. }
  26. load(coxfile)
  27. table(cox_results$p<0.01)
  28. #>
  29. #> FALSE TRUE
  30. #> 11911 8169
  31. table(cox_results$p<0.05)
  32. #>
  33. #> FALSE TRUE
  34. #> 9373 10707
  35. lr = names(log_rank_p)[log_rank_p<0.01];length(lr)
  36. #> [1] 7057
  37. cox = rownames(cox_results)[cox_results$p<0.01];length(cox)
  38. #> [1] 8169
  39. length(intersect(lr,cox))
  40. #> [1] 6165
  41. save(lr,cox,file = paste0(proj,"_logrank_cox_gene.Rdata"))

KM-cox

  1. geneKM = function(gene){
  2. meta$group=ifelse(gene>median(gene),'high','low')
  3. data.survdiff=survdiff(Surv(time, event)~group,data=meta)
  4. p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
  5. return(p.val)
  6. }
  7. genecox = function(gene){
  8. meta$gene = gene
  9. #可直接使用连续型变量
  10. m = coxph(Surv(time, event) ~ gene, data = meta)
  11. #也可使用二分类变量
  12. #meta$group=ifelse(gene>median(gene),'high','low')
  13. #meta$group = factor(meta$group,levels = c("low","high"))
  14. #m=coxph(Surv(time, event) ~ group, data = meta)
  15. beta <- coef(m)
  16. se <- sqrt(diag(vcov(m)))
  17. HR <- exp(beta)
  18. HRse <- HR * se
  19. #summary(m)
  20. tmp <- round(cbind(coef = beta,
  21. se = se, z = beta/se,
  22. p = 1 - pchisq((beta/se)^2, 1),
  23. HR = HR, HRse = HRse,
  24. HRz = (HR - 1) / HRse,
  25. HRp = 1 - pchisq(((HR - 1)/HRse)^2, 1),
  26. HRCILL = exp(beta - qnorm(.975, 0, 1) * se),
  27. HRCIUL = exp(beta + qnorm(.975, 0, 1) * se)), 3)
  28. return(tmp['gene',])
  29. #return(tmp['grouphigh',])#二分类变量
  30. }

生存模型

image.png

Lasso回归

作用:①将重要的基因筛选出来,
②可得出公式,得出参与基因的系数,代入后可得出风险值
image.png

训练集和测试集不同

  1. 1.准备输入数据
  2. rm(list = ls())
  3. proj = "TCGA-KIRC"
  4. load(paste0(proj,"_sur_model.Rdata"))
  5. ls()
  6. #> [1] "exprSet" "meta" "proj"
  7. exprSet[1:4,1:4]
  8. #> TCGA-B8-5162-01A TCGA-B4-5834-01A TCGA-CW-6087-01A TCGA-B0-4698-01A
  9. #> WASH7P 0.38330266 1.0823971 0.24141414 0.8200756
  10. #> AL627309.6 0.04121953 0.1330091 0.60612548 1.7747381
  11. #> WASH9P 0.95751073 1.8118992 0.66255436 1.7390920
  12. #> AP006222.1 0.15825052 0.2713707 0.08503973 0.5589702
  13. str(meta)
  14. #> 'data.frame': 513 obs. of 7 variables:
  15. #> $ ID : chr "TCGA-B8-5162-01A" "TCGA-B4-5834-01A" "TCGA-CW-6087-01A" "TCGA-B0-4698-01A" ...
  16. #> $ event : int 0 0 1 1 1 0 1 0 1 0 ...
  17. #> $ time : num 1.2 1.27 1.37 1.4 1.43 ...
  18. #> $ race : chr "white" "white" "white" "white" ...
  19. #> $ age : int 62 59 61 75 65 35 72 50 84 76 ...
  20. #> $ gender: chr "male" "male" "male" "male" ...
  21. #> $ stage : chr "II" "I" "IV" "IV" ...
  22. load(paste0(proj,"_logrank_cox_gene.Rdata"))
  23. g = cox[1:1000] #这里是示例,取了1000个单因素cox筛选的基因,活学活用
  24. exprSet = exprSet[g,]
  25. 2.构建lasso回归模型
  26. 输入数据是表达矩阵(仅含tumor样本)和每个病人对应的生死(顺序必须一致)。
  27. x=t(exprSet)
  28. y=meta$event
  29. library(glmnet)
  30. #> Warning: package 'glmnet' was built under R version 4.1.3
  31. #> Warning: package 'Matrix' was built under R version 4.1.3
  32. 2.1挑选合适的λ值
  33. Lambda 是构建模型的重要参数。他的大小关系着模型选择的基因个数
  34. #调优参数
  35. set.seed(1006)
  36. cv_fit <- cv.glmnet(x=x, y=y)
  37. plot(cv_fit)
  38. #系数图
  39. fit <- glmnet(x=x, y=y)
  40. plot(fit,xvar = "lambda")
  41. 两条虚线分别指示了两个特殊的λ值,一个是lambda.min,一个是lambda.1se,这两个值之间的lambda都认为是合适的。
  42. lambda.1se构建的模型最简单,即使用的基因数量少,而lambda.min则准确率更高一点,使用的基因数量更多一点。
  43. 2.2 用这两个λ值重新建模
  44. model_lasso_min <- glmnet(x=x, y=y,lambda=cv_fit$lambda.min)
  45. model_lasso_1se <- glmnet(x=x, y=y,lambda=cv_fit$lambda.1se)
  46. 选中的基因与系数存放于模型的子集beta中,用到的基因有一个s0值,没用的基因只记录了“.”,所以可以用下面代码挑出用到的基因。
  47. head(model_lasso_min$beta,20)
  48. choose_gene_min=rownames(model_lasso_min$beta)[as.numeric(model_lasso_min$beta)!=0]
  49. choose_gene_1se=rownames(model_lasso_1se$beta)[as.numeric(model_lasso_1se$beta)!=0]
  50. length(choose_gene_min)
  51. #> [1] 41
  52. length(choose_gene_1se)
  53. #> [1] 33
  54. save(choose_gene_min,file = paste0(proj,"_lasso_choose_gene_min.Rdata"))
  55. save(choose_gene_1se,file = paste0(proj,"_lasso_choose_gene_1se.Rdata"))
  56. 3.模型预测和评估
  57. newx参数是预测对象。输出结果lasso.prob是一个矩阵,第一列是min的预测结果,第二列是1se的预测结果,预测结果是概率,或者说百分比,不是绝对的01
  58. 将每个样本的生死和预测结果放在一起,直接cbind即可。
  59. lasso.prob <- predict(cv_fit, newx=x , s=c(cv_fit$lambda.min,cv_fit$lambda.1se) )
  60. head(lasso.prob)
  61. #> s1 s2
  62. #> TCGA-B8-5162-01A 0.35441206 0.37673630
  63. #> TCGA-B4-5834-01A 0.06975962 0.07579627
  64. #> TCGA-CW-6087-01A 0.72387660 0.64176153
  65. #> TCGA-B0-4698-01A 0.63402199 0.54280316
  66. #> TCGA-B0-4690-01A 0.83592571 0.75407756
  67. #> TCGA-AS-3778-01A -0.13649093 -0.02911398
  68. ROC曲线
  69. library(pROC)
  70. library(ggplot2)
  71. m <- roc(meta$event,lasso.prob[,1])
  72. g <- ggroc(m,legacy.axes = T,size = 1,color = "#2fa1dd")
  73. auc(m)
  74. #> Area under the curve: 0.8687
  75. g + theme_bw() +
  76. geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1),
  77. colour = "grey", linetype = "dashed")+
  78. annotate("text",x = .75, y = .25,
  79. label = paste("AUC of min = ",format(round(as.numeric(auc(m)),2),nsmall = 2)),color = "#2fa1dd")
  80. 计算AUC取值范围在0.5-1之间,越接近于1越好。可以根据预测结果绘制ROC曲线。
  81. 两个模型的曲线画在一起
  82. m2 <- roc(meta$event, lasso.prob[,2])
  83. auc(m2)
  84. #> Area under the curve: 0.8507
  85. g <- ggroc(list(min = m,se = m2),legacy.axes = T,size = 1)
  86. g + theme_bw() +
  87. scale_color_manual(values = c("#2fa1dd", "#f87669"))+
  88. geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1),
  89. colour = "grey", linetype = "dashed")+
  90. annotate("text",x = .75, y = .25,
  91. label = paste("AUC of min = ",format(round(as.numeric(auc(m)),2),nsmall = 2)),color = "#2fa1dd")+
  92. annotate("text",x = .75, y = .15,
  93. label = paste("AUC of 1se = ",format(round(as.numeric(auc(m2)),2),nsmall = 2)),color = "#f87669")
  94. 5.切割数据构建模型并预测
  95. 5.1 切割数据(数据>500
  96. Rcaret切割数据,生成的结果是一组代表列数的数字,用这些数字来给表达矩阵和meta取子集即可。
  97. library(caret)
  98. #> Warning: package 'caret' was built under R version 4.1.3
  99. set.seed(12345679)
  100. sam<- createDataPartition(meta$event, p = .5,list = FALSE)
  101. head(sam)
  102. #> Resample1
  103. #> [1,] 5
  104. #> [2,] 9
  105. #> [3,] 13
  106. #> [4,] 17
  107. #> [5,] 19
  108. #> [6,] 21
  109. 可查看两组一些临床参数切割比例
  110. train <- exprSet[,sam]
  111. test <- exprSet[,-sam]
  112. train_meta <- meta[sam,]
  113. test_meta <- meta[-sam,]
  114. prop.table(table(train_meta$stage))
  115. #>
  116. #> I II III IV
  117. #> 0.5058824 0.1019608 0.2117647 0.1803922
  118. prop.table(table(test_meta$stage))
  119. #>
  120. #> I II III IV
  121. #> 0.4941176 0.1176471 0.2470588 0.1411765
  122. prop.table(table(test_meta$race))
  123. #>
  124. #> asian black or african american white
  125. #> 0.01587302 0.09920635 0.88492063
  126. prop.table(table(train_meta$race))
  127. #>
  128. #> asian black or african american white
  129. #> 0.01574803 0.10236220 0.88188976
  130. 5.2 切割后的train数据集建模
  131. 和上面的建模方法一样。
  132. #计算lambda
  133. x = t(train)
  134. y = train_meta$event
  135. cv_fit <- cv.glmnet(x=x, y=y)
  136. plot(cv_fit)
  137. #构建模型
  138. model_lasso_min <- glmnet(x=x, y=y,lambda=cv_fit$lambda.min)
  139. model_lasso_1se <- glmnet(x=x, y=y,lambda=cv_fit$lambda.1se)
  140. #挑出基因
  141. head(model_lasso_min$beta)
  142. #> 6 x 1 sparse Matrix of class "dgCMatrix"
  143. #> s0
  144. #> WASH7P .
  145. #> AL627309.6 .
  146. #> WASH9P .
  147. #> MTATP6P1 .
  148. #> LINC01409 .
  149. #> LINC00115 .
  150. choose_gene_min=rownames(model_lasso_min$beta)[as.numeric(model_lasso_min$beta)!=0]
  151. choose_gene_1se=rownames(model_lasso_1se$beta)[as.numeric(model_lasso_1se$beta)!=0]
  152. length(choose_gene_min)
  153. #> [1] 31
  154. length(choose_gene_1se)
  155. #> [1] 21
  156. 4.模型预测
  157. 用训练集构建模型,预测测试集的生死,注意newx参数变了。
  158. lasso.prob <- predict(cv_fit, newx=t(test), s=c(cv_fit$lambda.min,cv_fit$lambda.1se) )
  159. head(lasso.prob)
  160. #> s1 s2
  161. #> TCGA-B8-5162-01A 0.39653904 0.43398261
  162. #> TCGA-B4-5834-01A 0.10082314 0.16288716
  163. #> TCGA-CW-6087-01A 0.69858720 0.63793147
  164. #> TCGA-B0-4698-01A 0.52026571 0.54582571
  165. #> TCGA-AS-3778-01A -0.02908933 0.07121892
  166. #> TCGA-B2-4098-01A 0.38169229 0.35982935
  167. 再画ROC曲线
  168. library(pROC)
  169. library(ggplot2)
  170. m <- roc(test_meta$event, lasso.prob[,1])
  171. g <- ggroc(m,legacy.axes = T,size = 1,color = "#2fa1dd")
  172. auc(m)
  173. #> Area under the curve: 0.7696
  174. g + theme_bw() +
  175. geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1),
  176. colour = "grey", linetype = "dashed")+
  177. annotate("text",x = .75, y = .25,
  178. label = paste("AUC of min = ",format(round(as.numeric(auc(m)),2),nsmall = 2)),color = "#2fa1dd")
  179. 计算AUC取值范围在0.5-1之间,越接近于1越好。可以根据预测结果绘制ROC曲线。
  180. 两个模型的曲线画在一起
  181. m2 <- roc(test_meta$event, lasso.prob[,2])
  182. auc(m2)
  183. #> Area under the curve: 0.7569
  184. g <- ggroc(list(min = m,se = m2),legacy.axes = T,size = 1)
  185. g + theme_bw() +
  186. scale_color_manual(values = c("#2fa1dd", "#f87669"))+
  187. geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1),
  188. colour = "grey", linetype = "dashed")+
  189. annotate("text",x = .75, y = .25,
  190. label = paste("AUC of min = ",format(round(as.numeric(auc(m)),2),nsmall = 2)),color = "#2fa1dd")+
  191. annotate("text",x = .75, y = .15,
  192. label = paste("AUC of 1se = ",format(round(as.numeric(auc(m2)),2),nsmall = 2)),color = "#f87669")

用自己的数据进行逻辑回归- lasso分析
导入数据格式
image.png

  1. rm(list = ls())
  2. library(glmnet)
  3. library(readxl)
  4. library(plyr)
  5. library(caret)
  6. library(corrplot)
  7. library(ggplot2)
  8. library(Hmisc)
  9. library(openxlsx)
  10. data <- read.xlsx("/Users/mac/Desktop/r/data1.xlsx")
  11. x <- as.matrix(data[,-c(1:2)])
  12. y <- as.double(data$Group)
  13. fit <- glmnet(x,y,family = "binomial",nlambda = 1000,alpha = 1)
  14. print(fit)
  15. plot(fit,xvar = "lambda")
  16. #筛选最合适的变量
  17. lasso_fit <- cv.glmnet(x,y,family="binomial",alpha = 1,type.measure = "auc",nlambda = 1000)
  18. plot(lasso_fit)
  19. print(lasso_fit)
  20. #使用lambda.lse 获取的模型较好
  21. lasso_best <- glmnet(x=x,y=y,alpha = 1,lambda = lasso_fit$lambda.1se)
  22. coef(lasso_best)
  23. coefficient <- coef(lasso_best,s=lasso_best$lambda.lse)
  24. coe <- coefficient@x
  25. coe <- as.data.frame(coe)
  26. Active_Index <- which(as.numeric(coefficient)!=0)
  27. active_coefficients <- as.numeric(coefficient)[Active_Index]
  28. variable <- rownames(coefficient)[Active_Index]
  29. variable <- as.data.frame(variable)
  30. variable <-cbind(variable,coe)
  31. #使用lambda.min 适用于样本质量较差,纳入模型中的变量较多
  32. lasso_best <- glmnet(x=x,y=y,alpha = 1,lambda = lasso_fit$lambda.min)
  33. coef(lasso_best)
  34. coefficient <- coef(lasso_best,s=lasso_best$lambda.min)
  35. coe <- coefficient@x
  36. coe <- as.data.frame(coe)
  37. Active_Index <- which(as.numeric(coefficient)!=0)
  38. active_coefficients <- as.numeric(coefficient)[Active_Index]
  39. variable <- rownames(coefficient)[Active_Index]
  40. variable <- as.data.frame(variable)
  41. variable <-cbind(variable,coe)

timeROC

image.png
image.png

cox-forest

  1. 1.准备输入数据
  2. rm(list = ls())
  3. proj = "TCGA-KIRC"
  4. if(!require(My.stepwise))install.packages("My.stepwise")
  5. #> Warning: package 'My.stepwise' was built under R version 4.1.3
  6. load(paste0(proj,"_sur_model.Rdata"))
  7. load(paste0(proj,"_lasso_choose_gene_1se.Rdata"))
  8. g = choose_gene_1se
  9. 2.构建coxph模型
  10. 将用于建模的基因(例如lasso回归选中的基因)从表达矩阵中取出来,,可作为列添加在meta表噶的后面,组成的数据框赋值给dat
  11. library(stringr)
  12. e=t(exprSet[g,])
  13. colnames(e)= str_replace_all(colnames(e),"-","_")
  14. dat=cbind(meta,e)
  15. dat$gender=as.numeric(factor(dat$gender))
  16. dat$stage=as.numeric(factor(dat$stage))
  17. colnames(dat)
  18. #> [1] "ID" "event" "time" "race" "age"
  19. #> [6] "gender" "stage" "AL391244.2" "AJAP1" "AL357140.2"
  20. #> [11] "CPLANE2" "RCC2" "AL137127.1" "NIPAL3" "KDF1"
  21. #> [16] "LINC01389" "ANGPTL3" "GADD45A" "LINC01725" "LINC02609"
  22. #> [21] "CHI3L2" "LINC01719" "KCNN3" "MSTO1" "PMF1_BGLAP"
  23. #> [26] "CRABP2" "CD1E" "FCER1A" "NUF2" "DUTP6"
  24. #> [31] "IGFN1" "NEK2" "PROX1" "DUSP5P1" "HS1BP3_IT1"
  25. #> [36] "OTOF" "CENPA" "SLC5A6" "PRKCE" "SPTBN1"
  26. 逐步回归法构建最优模型
  27. 输出结果行数太多,所以我注释掉了
  28. library(survival)
  29. #> Warning: package 'survival' was built under R version 4.1.3
  30. library(survminer)
  31. # 不能允许缺失值
  32. dat2 = na.omit(dat)
  33. library(My.stepwise)
  34. vl <- colnames(dat)[c(5:ncol(dat))]
  35. # My.stepwise.coxph(Time = "time",
  36. # Status = "event",
  37. # variable.list = vl,
  38. # data = dat2)
  39. 使用输出结果里的最后一个模型
  40. model = coxph(formula = Surv(time, event) ~ stage + AL357140.2 + AL391244.2 +
  41. age + OTOF + IGFN1 + CRABP2 + PMF1_BGLAP + SLC5A6 + PROX1 +
  42. LINC01719 + DUTP6, data = dat2)
  43. 3.模型可视化-森林图
  44. ggforest(model,data = dat2)
  45. 4.模型预测
  46. fp <- predict(model,newdata = dat2)
  47. library(Hmisc)
  48. #> Warning: package 'Hmisc' was built under R version 4.1.3
  49. options(scipen=200)
  50. with(dat2,rcorr.cens(fp,Surv(time, event))) #with函数下可将列名当作变量用
  51. #> C Index Dxy S.D. n missing
  52. #> 0.17850596(实为1- C Index 的值,故须换算) -0.64298808 0.02857663 503.00000000 0.00000000
  53. #> uncensored Relevant Pairs Concordant Uncertain
  54. #> 166.00000000 105834.00000000 18892.00000000 146654.00000000
  55. C-index用于计算生存分析中的COX模型预测值与真实之间的区分度(discrimination),也称为Harrells concordanceindexC-index0.5-1之间。0.5为完全不一致,说明该模型没有预测作用,1为完全一致,说明该模型预测结果与实际完全一致。
  56. 5.切割数据构建模型并预测
  57. 5.1 切割数据
  58. Rcaret切割数据,生成的结果是一组代表列数的数字,用这些数字来给表达矩阵和meta取子集即可。
  59. library(caret)
  60. #> Warning: package 'caret' was built under R version 4.1.3
  61. set.seed(12345679)
  62. sam<- createDataPartition(meta$event, p = .5,list = FALSE)
  63. train <- exprSet[,sam]
  64. test <- exprSet[,-sam]
  65. train_meta <- meta[sam,]
  66. test_meta <- meta[-sam,]
  67. 5.2 切割后的train数据集建模
  68. 和上面的建模方法一样。
  69. e=t(train[g,])
  70. colnames(e)= str_replace_all(colnames(e),"-","_")
  71. dat=cbind(train_meta,e)
  72. dat$gender=as.numeric(factor(dat$gender))
  73. dat$stage=as.numeric(factor(dat$stage))
  74. colnames(dat)
  75. #> [1] "ID" "event" "time" "race" "age"
  76. #> [6] "gender" "stage" "AL391244.2" "AJAP1" "AL357140.2"
  77. #> [11] "CPLANE2" "RCC2" "AL137127.1" "NIPAL3" "KDF1"
  78. #> [16] "LINC01389" "ANGPTL3" "GADD45A" "LINC01725" "LINC02609"
  79. #> [21] "CHI3L2" "LINC01719" "KCNN3" "MSTO1" "PMF1_BGLAP"
  80. #> [26] "CRABP2" "CD1E" "FCER1A" "NUF2" "DUTP6"
  81. #> [31] "IGFN1" "NEK2" "PROX1" "DUSP5P1" "HS1BP3_IT1"
  82. #> [36] "OTOF" "CENPA" "SLC5A6" "PRKCE" "SPTBN1"
  83. library(My.stepwise)
  84. dat2 = na.omit(dat)
  85. vl <- colnames(dat2)[c(5:ncol(dat2))]
  86. # My.stepwise.coxph(Time = "time",
  87. # Status = "event",
  88. # variable.list = vl,
  89. # data = dat2)
  90. model = coxph(formula = Surv(time, event) ~ stage + AL357140.2 + IGFN1 +
  91. AJAP1 + ANGPTL3 + SLC5A6 + HS1BP3_IT1 + AL137127.1 + LINC01719 +
  92. CRABP2, data = dat2)
  93. 5.3 模型可视化
  94. ggforest(model, data =dat2)
  95. 5.4 用切割后的数据test数据集验证模型
  96. e=t(test[g,])
  97. colnames(e)= str_replace_all(colnames(e),"-","_")
  98. test_dat=cbind(test_meta,e)
  99. test_dat$gender=as.numeric(factor(test_dat$gender))
  100. test_dat$stage=as.numeric(factor(test_dat$stage))
  101. fp <- predict(model,newdata = test_dat)
  102. library(Hmisc)
  103. with(test_dat,rcorr.cens(fp,Surv(time, event)))
  104. #> C Index Dxy S.D. n missing
  105. #> 0.22187105 -0.55625790 0.04736798 255.00000000 1.00000000
  106. #> uncensored Relevant Pairs Concordant Uncertain
  107. #> 86.00000000 28476.00000000 6318.00000000 36294.00000000