image.png

随机森林模型

  1. randomforest
  2. 生信技能树
  3. 1.准备输入数据
  4. 输入数据是肿瘤样本表达矩阵exprSet、临床信息meta
  5. proj = "TCGA-KIRC"
  6. load(paste0(proj,"_sur_model.Rdata"))
  7. load(paste0(proj,"_logrank_cox_gene.Rdata"))
  8. g = cox[1:1000] #这里是示例,取了1000个单因素cox筛选的基因,活学活用
  9. exprSet = exprSet[g,]
  10. 数据量太大需要运行很久,这里再缩小一点范围,用自己的数据时删掉这段代码
  11. exprSet = exprSet[1:1000,1:100]
  12. meta = meta[1:100,]
  13. library(randomForest)
  14. library(ROCR)
  15. library(genefilter)
  16. library(Hmisc)
  17. 2.构建随机森林模型
  18. 输入数据是表达矩阵(仅含tumor样本)和对应的生死。
  19. x=t(exprSet)
  20. y=meta$event
  21. #构建模型,一个叫randomForest的函数,运行时间很长,存Rdata跳过
  22. tmp_rf=paste0(proj,"_rf_output.Rdata")
  23. if(!file.exists(tmp_rf)){
  24. rf_output=randomForest(x=x, y=y,importance = TRUE, #ntree = 10001,
  25. proximity=TRUE )
  26. save(rf_output,file = tmp_rf)
  27. }
  28. load(file = tmp_rf)
  29. #top30的基因
  30. varImpPlot(rf_output, type=2, n.var=30, scale=FALSE,
  31. main="Variable Importance (Gini) for top 30 predictors",cex =.7)
  32. rf_importances=importance(rf_output, scale=FALSE)
  33. head(rf_importances)
  34. #> %IncMSE IncNodePurity
  35. #> WASH7P 1.081871e-04 4.732143e-03
  36. #> AL627309.6 -4.949768e-05 7.355461e-02
  37. #> WASH9P -1.935604e-04 2.077797e-02
  38. #> MTATP6P1 0.000000e+00 3.552714e-18
  39. #> LINC01409 9.696970e-05 6.891663e-03
  40. #> LINC00115 -5.882353e-05 9.679104e-03
  41. #解释量top30的基因,和图上是一样的,从下往上看。
  42. choose_gene=rownames(tail(rf_importances[order(rf_importances[,2]),],30))
  43. 3.模型预测和评估
  44. rf.prob <- predict(rf_output, x)
  45. re=cbind(y ,rf.prob)
  46. head(re)
  47. #> y rf.prob
  48. #> TCGA-B8-5162-01A 0 0.2658333
  49. #> TCGA-B4-5834-01A 0 0.0769000
  50. #> TCGA-CW-6087-01A 1 0.8897333
  51. #> TCGA-B0-4698-01A 1 0.9392000
  52. #> TCGA-B0-4690-01A 1 0.9759000
  53. #> TCGA-AS-3778-01A 0 0.1081333
  54. ROC曲线
  55. library(ROCR)
  56. #library(caret)
  57. pred <- prediction(re[,2], re[,1])
  58. auc = performance(pred,"auc")@y.values[[1]]
  59. auc
  60. #> [1] 1
  61. 4.切割数据构建模型并预测
  62. 4.1 切割数据
  63. Rcaret切割数据,生成的结果是一组代表列数的数字,用这些数字来给表达矩阵和meta取子集即可。
  64. library(caret)
  65. set.seed(12345679)
  66. sam<- createDataPartition(meta$event, p = .5,list = FALSE)
  67. train <- exprSet[,sam]
  68. test <- exprSet[,-sam]
  69. train_meta <- meta[sam,]
  70. test_meta <- meta[-sam,]
  71. 4.2 切割后的train数据集建模
  72. 和上面的建模方法一样。
  73. #建模
  74. x = t(train)
  75. y = train_meta$event
  76. tmp_rf=paste0(proj,"_t_rf_output.Rdata")
  77. if(!file.exists(tmp_rf)){
  78. rf_output=randomForest(x=x, y=y,importance = TRUE, #ntree = 10001,
  79. proximity=TRUE )
  80. save(rf_output,file = tmp_rf)
  81. }
  82. load(file = tmp_rf)
  83. choose_gene=rownames(tail(rf_importances[order(rf_importances[,2]),],30))
  84. head(choose_gene)
  85. #> [1] "PUSL1" "CFAP45" "ISG15" "CGN" "IBA57"
  86. #> [6] "AC104794.5"
  87. 5.模型预测
  88. 用训练集构建模型,预测测试集的生死。
  89. x = t(test)
  90. y = test_meta$event
  91. rf.prob <- predict(rf_output, x)
  92. re=cbind(y ,rf.prob)
  93. re=as.data.frame(re)
  94. colnames(re)=c('event','prob')
  95. re$event=as.factor(re$event)
  96. 再看AUC值。
  97. library(ROCR)
  98. # 训练集模型预测测试集
  99. pred <- prediction(re[,2], re[,1])
  100. auc= performance(pred,"auc")@y.values[[1]]
  101. auc
  102. #> [1] 0.8181818
  103. 还行。

支持向量机模型

  1. svm
  2. 生信技能树
  3. 1.准备输入数据
  4. proj = "TCGA-KIRC"
  5. load(paste0(proj,"_sur_model.Rdata"))
  6. library(ROCR)
  7. library(genefilter)
  8. library(Hmisc)
  9. #> Warning: package 'Hmisc' was built under R version 4.1.3
  10. #> Warning: package 'survival' was built under R version 4.1.3
  11. library(e1071)
  12. 2.构建支持向量机模型
  13. 2.1.切割数据
  14. Rcaret切割数据,生成的结果是一组代表列数的数字,用这些数字来给表达矩阵和meta取子集即可。
  15. library(caret)
  16. #> Warning: package 'caret' was built under R version 4.1.3
  17. set.seed(12345679)
  18. sam<- createDataPartition(meta$event, p = .5,list = FALSE)
  19. train <- exprSet[,sam]
  20. test <- exprSet[,-sam]
  21. train_meta <- meta[sam,]
  22. test_meta <- meta[-sam,]
  23. 2.2 train数据集建模
  24. x=t(train)
  25. y=as.factor(train_meta$event)
  26. model = svm(x,y,kernel = "linear")
  27. summary(model)
  28. #>
  29. #> Call:
  30. #> svm.default(x = x, y = y, kernel = "linear")
  31. #>
  32. #>
  33. #> Parameters:
  34. #> SVM-Type: C-classification
  35. #> SVM-Kernel: linear
  36. #> cost: 1
  37. #>
  38. #> Number of Support Vectors: 221
  39. #>
  40. #> ( 75 146 )
  41. #>
  42. #>
  43. #> Number of Classes: 2
  44. #>
  45. #> Levels:
  46. #> 0 1
  47. 2.3.模型预测
  48. 用训练集构建模型,预测测试集的生死。不同于其他模型,这个预测结果是分类变量,直接预测生死,而不是prob
  49. x=t(test)
  50. y=as.factor(test_meta$event)
  51. pred = predict(model, x)
  52. table(pred,y)
  53. #> y
  54. #> pred 0 1
  55. #> 0 153 44
  56. #> 1 17 42

image.png

time-ROC曲线

  1. timeROC
  2. 生信技能树
  3. 1.构建模型并完成预测
  4. load("TCGA-KIRC_sur_model.Rdata")
  5. load("TCGA-KIRC_logrank_cox_gene.Rdata")
  6. g = cox[1:1000] #这里是示例,取了1000个单因素cox筛选的基因,活学活用
  7. exprSet = exprSet[g,]
  8. x=t(exprSet)
  9. y=meta$event
  10. library(glmnet)
  11. #> Warning: package 'glmnet' was built under R version 4.1.3
  12. #> Warning: package 'Matrix' was built under R version 4.1.3
  13. cv_fit <- cv.glmnet(x=x, y=y)
  14. model_lasso_min <- glmnet(x=x, y=y, lambda=cv_fit$lambda.min)
  15. choose_gene_min=rownames(model_lasso_min$beta)[as.numeric(model_lasso_min$beta)!=0]
  16. lasso.prob <- predict(cv_fit, newx=x , s=cv_fit$lambda.min )
  17. head(lasso.prob)
  18. #> s1
  19. #> TCGA-B8-5162-01A 0.35258903
  20. #> TCGA-B4-5834-01A 0.06925658
  21. #> TCGA-CW-6087-01A 0.70950202
  22. #> TCGA-B0-4698-01A 0.61516034
  23. #> TCGA-B0-4690-01A 0.82493174
  24. #> TCGA-AS-3778-01A -0.11996394
  25. dat = cbind(meta[,1:3],
  26. fp = lasso.prob[,1])
  27. head(dat)
  28. #> ID event time fp
  29. #> TCGA-B8-5162-01A TCGA-B8-5162-01A 0 1.200000 0.35258903
  30. #> TCGA-B4-5834-01A TCGA-B4-5834-01A 0 1.266667 0.06925658
  31. #> TCGA-CW-6087-01A TCGA-CW-6087-01A 1 1.366667 0.70950202
  32. #> TCGA-B0-4698-01A TCGA-B0-4698-01A 1 1.400000 0.61516034
  33. #> TCGA-B0-4690-01A TCGA-B0-4690-01A 1 1.433333 0.82493174
  34. #> TCGA-AS-3778-01A TCGA-AS-3778-01A 0 1.433333 -0.11996394
  35. 2 time-ROC计算和出图
  36. library(survminer)
  37. library(survival)
  38. #> Warning: package 'survival' was built under R version 4.1.3
  39. library(timeROC)
  40. result <-with(dat, timeROC(T=time,
  41. delta=event,
  42. marker=fp,
  43. cause=1,
  44. times=c(12,36,96),
  45. iid = TRUE))
  46. plot_dat = data.frame(fpr = as.numeric(result$FP),
  47. tpr = as.numeric(result$TP),
  48. time = rep(as.factor(c(12,36,96)),each = nrow(result$TP)))
  49. library(ggplot2)
  50. ggplot() +
  51. geom_line(data = plot_dat,aes(x = fpr, y = tpr,color = time),size = 1) +
  52. scale_color_manual(name = NULL,values = c("#2874C5", "#f87669", "#e6b707"),
  53. labels = paste0("AUC of ",c(1,3,5),"-y survival: ",
  54. format(round(result$AUC,2),nsmall = 2)))+
  55. geom_line(aes(x=c(0,1),y=c(0,1)),color = "grey")+
  56. theme_bw()+
  57. theme(panel.grid = element_blank(),
  58. legend.background = element_rect(linetype = 1, size = 0.2, colour = "black"),
  59. legend.position = c(0.765,0.125))+
  60. scale_x_continuous(expand = c(0.005,0.005))+
  61. scale_y_continuous(expand = c(0.005,0.005))+
  62. labs(x = "1 - Specificity",
  63. y = "Sensitivity")+
  64. coord_fixed()

风险因子三图联动

  1. 三图联动
  2. 生信技能树
  3. 1.构建cox模型并预测
  4. proj = "TCGA-KIRC"
  5. load(paste0(proj,"_sur_model.Rdata"))
  6. load(paste0(proj,"_lasso_choose_gene_1se.Rdata"))
  7. library(survival)
  8. library(survminer)
  9. library(ggplotify)
  10. library(cowplot)
  11. library(Hmisc)
  12. options(scipen=200)
  13. library(pheatmap)
  14. library(stringr)
  15. e=t(exprSet[choose_gene_1se,])
  16. colnames(e)= str_replace_all(colnames(e),"-","_")
  17. dat=cbind(meta,e)
  18. dat$gender=as.numeric(factor(dat$gender))
  19. dat$stage=as.numeric(factor(dat$stage))
  20. colnames(dat)
  21. #> [1] "ID" "event" "time" "race" "age"
  22. #> [6] "gender" "stage" "AL391244.2" "AJAP1" "AL357140.2"
  23. #> [11] "CPLANE2" "RCC2" "AL137127.1" "NIPAL3" "KDF1"
  24. #> [16] "LINC01389" "ANGPTL3" "GADD45A" "LINC01725" "LINC02609"
  25. #> [21] "CHI3L2" "LINC01719" "KCNN3" "MSTO1" "PMF1_BGLAP"
  26. #> [26] "CRABP2" "CD1E" "FCER1A" "NUF2" "DUTP6"
  27. #> [31] "IGFN1" "NEK2" "PROX1" "DUSP5P1" "HS1BP3_IT1"
  28. #> [36] "OTOF" "CENPA" "SLC5A6" "PRKCE" "SPTBN1"
  29. library(survival)
  30. library(survminer)
  31. dat2 = na.omit(dat)
  32. model = coxph(formula = Surv(time, event) ~ stage + AL357140.2 + AL391244.2 +
  33. age + OTOF + IGFN1 + CRABP2 + PMF1_BGLAP + SLC5A6 + PROX1 +
  34. LINC01719 + DUTP6, data = dat2)
  35. ggforest(model, data =dat2)
  36. fp <- predict(model,dat2)
  37. head(fp)
  38. #> TCGA-B8-5162-01A TCGA-B4-5834-01A TCGA-CW-6087-01A TCGA-B0-4698-01A
  39. #> -0.2267798 -1.2751770 2.0868264 2.6742892
  40. #> TCGA-B0-4690-01A TCGA-AS-3778-01A
  41. #> 2.4891564 -2.3198774
  42. 2.划分高低风险并画生存分析图
  43. names(fp) = dat2$ID
  44. ri = ifelse(fp<median(fp),"lowrisk","highrisk")
  45. ri = factor(ri,levels = c("lowrisk","highrisk"))
  46. sfit <- survfit(Surv(time, event)~ri, data=dat2)
  47. ggsurvplot(sfit,palette = c("#E7B800", "#2E9FDF"),
  48. risk.table =TRUE,pval =TRUE,
  49. conf.int =TRUE,xlab ="Time in months",
  50. ggtheme =theme_light())
  51. 3.风险因子三图联动
  52. fp_dat=data.frame(patientid=1:length(fp),
  53. fp=as.numeric(sort(fp)),
  54. ri= ri[order(fp)])
  55. sur_dat=data.frame(patientid=1:length(fp),
  56. time=dat2[order(fp),'time'] ,
  57. event=dat2[order(fp),'event'])
  58. sur_dat$event=ifelse(sur_dat$event==0,'alive','death')
  59. sur_dat$event=factor(sur_dat$event,levels = c("death","alive"))
  60. library(stringr)
  61. gs = str_split("stage + AL357140.2 + AL391244.2 + age + OTOF + IGFN1 + CRABP2 + PMF1_BGLAP + SLC5A6 + PROX1 + LINC01719 + DUTP6"," \\+ ")[[1]]
  62. gs = gs[gs %in% rownames(exprSet)]
  63. exp_dat = t(dat2[order(fp),gs])
  64. colnames(exp_dat) = str_sub(colnames(exp_dat),1,12)
  65. rownames(exp_dat) = str_replace_all(rownames(exp_dat),"_","-")
  66. ###第一个图----
  67. p1=ggplot(fp_dat,aes(x=patientid,y=fp))+
  68. geom_point(aes(color = ri))+
  69. scale_color_manual(values = c("#2874C5", "#f87669"))+
  70. geom_vline(xintercept = 0.5*nrow(sur_dat),lty = 2)+
  71. scale_x_continuous(expand=c(0.01,0))+
  72. labs(x = "",y = "risk score")+
  73. theme_bw()
  74. #第二个图
  75. p2=ggplot(sur_dat,aes(x=patientid,y=time))+
  76. geom_point(aes(col=event))+
  77. scale_color_manual(values = c("#f87669","#2874C5"))+
  78. geom_vline(xintercept = 0.5*nrow(sur_dat),lty = 2)+
  79. scale_x_continuous(expand=c(0.01,0))+
  80. labs(x = "")+
  81. theme_bw()
  82. #第三个图
  83. annotation_col = data.frame(group = ri,
  84. row.names = names(ri))
  85. mycolors <- colorRampPalette(c("#2874C5","white", "#f87669"), bias = 1.2)(100)
  86. ann_colors = list(
  87. group = c(lowrisk="#2874C5", highrisk="#f87669")
  88. )
  89. p3=pheatmap::pheatmap(exp_dat,
  90. col= mycolors,
  91. annotation_col = annotation_col,
  92. annotation_colors =ann_colors,
  93. scale = "row",
  94. breaks = seq(-3,3,length.out = 100),
  95. show_colnames = F,
  96. cluster_cols = F,
  97. annotation_legend = F)
  98. #拼图实现三图联动
  99. library(patchwork)
  100. p1/p2/as.ggplot(p3)
  101. # 暂时的补救办法
  102. library(tinyarray)
  103. n = scale(t(exp_dat))
  104. n[n > 3] = 3
  105. n[n < -3] = -3
  106. p3 = ggheat(n,fp_dat$ri,show_rownames = F)+
  107. theme(axis.text = element_text(size = 8))
  108. p3
  109. p1 /p2 /p3 + plot_layout(design = 'A
  110. B
  111. C
  112. C
  113. C
  114. C')
  115. ggsave("risk_plot.png",height = 10,width = 10)
  116. 从上向下三个图分别表示:
  117. 1.每个病人的预测值,按照从小到大排序 2.每个病人的生存时间,颜色区分生死 3.热图,挑出的基因在每个样本中的表达量

maf文件下载和整理

  1. maftools
  2. 1.数据获取
  3. 使用github上的RTCGAmutations来获取数据
  4. library(TCGAmutations)
  5. library(maftools)
  6. library(dplyr)
  7. proj='TCGA-KIRC'
  8. laml = tcga_load(study = "KIRC")
  9. laml
  10. #> An object of class MAF
  11. #> ID summary Mean Median
  12. #> 1: NCBI_Build NA NA NA
  13. #> 2: Center NA NA NA
  14. #> 3: Samples 370 NA NA
  15. #> 4: nGenes 10243 NA NA
  16. #> 5: Frame_Shift_Del 1792 4.843 4
  17. #> 6: Frame_Shift_Ins 463 1.251 1
  18. #> 7: In_Frame_Del 270 0.730 0
  19. #> 8: In_Frame_Ins 26 0.070 0
  20. #> 9: Missense_Mutation 17050 46.081 42
  21. #> 10: Nonsense_Mutation 1297 3.505 3
  22. #> 11: Nonstop_Mutation 33 0.089 0
  23. #> 12: Splice_Site 672 1.816 1
  24. #> 13: Translation_Start_Site 34 0.092 0
  25. #> 14: total 21637 58.478 53
  26. length(unique(laml@data$Tumor_Sample_Barcode))
  27. #> [1] 370
  28. length(unique(laml@data$Hugo_Symbol))
  29. #> [1] 10243
  30. 370个病人,10243个突变基因的信息。
  31. 3.突变数据的可视化
  32. 3.1 plotmafSummary
  33. maftools 自带可视化函数plotmafSummary,可以比较直观的统计maf文件的数据。
  34. #if (as.numeric(dev.cur()) != 1) graphics.off()
  35. plotmafSummary(maf = laml, rmOutlier = TRUE,
  36. #showBarcodes = FALSE,
  37. addStat = 'median', dashboard = TRUE, titvRaw = FALSE)
  38. 就是将laml@data数据框做了统计,用barplotboxplot做了可视化。
  39. 3.2 突变频谱图
  40. 可以选择突变数量前30的基因,也可以指定感兴趣的几个基因
  41. oncoplot(maf = laml, top = 30, fontSize = 0.7)
  42. g = c( "RBM43", "VHL", "SETD2", "PBRM1")
  43. oncoplot(maf = laml,genes = g, fontSize = 0.7)
  44. 3.2 可以给oncoplot加上一些临床信息
  45. pd 是临床信息数据框,第一列是Tumor_Sample_Barcode,后面几列是各种分组,如genderage,stage等等。
  46. colnames(laml@clinical.data)[57] = "stage"
  47. oncoplot(maf = laml,
  48. sortByAnnotation = TRUE,
  49. clinicalFeatures = c("stage",
  50. 'gender')
  51. )
  52. 自定义颜色
  53. col = RColorBrewer::brewer.pal(n = 8,name = 'Set3')
  54. stagecolors = col[1:4]
  55. names(stagecolors) = na.omit(unique(laml@clinical.data$stage))
  56. gendercolors = col[5:6]
  57. names(gendercolors) = unique(laml@clinical.data$gender)
  58. ancolors = list(stage = stagecolors,
  59. gender = gendercolors)
  60. oncoplot(maf = laml,
  61. sortByAnnotation = TRUE,
  62. clinicalFeatures = c("stage",
  63. 'gender'),
  64. annotationColor = ancolors,
  65. )

突变signature

  1. signafiture
  2. 0.R
  3. rm(list=ls())
  4. proj='TCGA-KIRC'
  5. library(maftools)
  6. library(dplyr)
  7. library(pheatmap)
  8. library(ComplexHeatmap)
  9. library(circlize)
  10. library(stringr)
  11. library(deconstructSigs)
  12. library(BSgenome)
  13. library(BSgenome.Hsapiens.UCSC.hg19)
  14. 1.读取突变数据maf
  15. 是和上一篇一样的数据,从gdc下载的maf文件和临床信息
  16. library(TCGAmutations)
  17. library(maftools)
  18. library(dplyr)
  19. proj='TCGA-KIRC'
  20. laml = tcga_load(study = "KIRC")
  21. laml
  22. #> An object of class MAF
  23. #> ID summary Mean Median
  24. #> 1: NCBI_Build NA NA NA
  25. #> 2: Center NA NA NA
  26. #> 3: Samples 370 NA NA
  27. #> 4: nGenes 10243 NA NA
  28. #> 5: Frame_Shift_Del 1792 4.843 4
  29. #> 6: Frame_Shift_Ins 463 1.251 1
  30. #> 7: In_Frame_Del 270 0.730 0
  31. #> 8: In_Frame_Ins 26 0.070 0
  32. #> 9: Missense_Mutation 17050 46.081 42
  33. #> 10: Nonsense_Mutation 1297 3.505 3
  34. #> 11: Nonstop_Mutation 33 0.089 0
  35. #> 12: Splice_Site 672 1.816 1
  36. #> 13: Translation_Start_Site 34 0.092 0
  37. #> 14: total 21637 58.478 53
  38. mut=laml@data
  39. mut[1:4,1:2]
  40. #> Hugo_Symbol Chromosome
  41. #> 1: NMT2 10
  42. #> 2: ARHGAP42 11
  43. #> 3: RAG1 11
  44. #> 4: NLRP14 11
  45. mut=mut[mut$Variant_Type=='SNP',]
  46. #每行记录了一个突变,所以统计样本列的行数即为样本的突变数量
  47. plot(table(mut[,16]),las = 2)
  48. 2.制作signatures的输入数据(96频谱)
  49. 关于什么是mutation
  50. signaturehttp://www.bio-info-trainee.com/1619.html
  51. 关于96频谱:http://www.biotrainee.com/thread-832-1-1.html
  52. a = mut[,c(16,2,3,5,6)]
  53. colnames(a)=c( "Sample","chr", "pos","ref", "alt")
  54. a$Sample=as.character(a$Sample)
  55. a$chr = paste0("chr",a$chr)
  56. sigs.input <- mut.to.sigs.input(mut.ref = a,
  57. sample.id = "Sample",
  58. chr = "chr",
  59. pos = "pos",
  60. ref = "ref",
  61. alt = "alt",
  62. bsg = BSgenome.Hsapiens.UCSC.hg19)
  63. class(sigs.input)
  64. #> [1] "data.frame"
  65. #第一个样本的突变绘图
  66. barplot(as.numeric(sigs.input[1,])~colnames(sigs.input))
  67. sigs.input[1:4,1:4]
  68. #> A[C>A]A A[C>A]C A[C>A]G A[C>A]T
  69. #> TCGA-3Z-A93Z-01A-11D-A36X-10 0 0 0 0
  70. #> TCGA-6D-AA2E-01A-11D-A36X-10 0 0 0 0
  71. #> TCGA-A3-3308-01A-01D-0966-08 0 0 0 0
  72. #> TCGA-A3-3311-01A-01D-0966-08 0 0 0 0
  73. 一顿操作猛如虎,生成signature与样本对应关系的矩阵
  74. if(!file.exists(paste0(proj,"_w.Rdata"))){
  75. w=lapply(unique(a$Sample), function(i){
  76. ## signatures.cosmic signatures.nature2013
  77. sample_1 = whichSignatures(tumor.ref = sigs.input,
  78. signatures.ref = signatures.cosmic,
  79. sample.id = i,
  80. contexts.needed = TRUE,
  81. tri.counts.method = 'default')
  82. print(c(i,which(unique(a$Sample)==i)))
  83. return(sample_1$weights)
  84. })
  85. w=do.call(rbind,w)
  86. save(w,file = paste0(proj,"_w.Rdata"))
  87. }
  88. load(paste0(proj,"_w.Rdata"))
  89. mat = t(w)
  90. 3.可视化
  91. 矩阵的可视化–热图
  92. pheatmap::pheatmap(mat,
  93. cluster_rows = F,
  94. cluster_cols = T,
  95. show_colnames = F)

任意基因的任意分组比较

  1. 任意基因的任意分组比较
  2. 0.输入数据
  3. rm(list=ls())
  4. proj = "TCGA-KIRC"
  5. f = "explore.Rdata"
  6. if(!file.exists(f)){
  7. load(paste0(proj,".Rdata"))
  8. load(paste0(proj,"_sur_model.Rdata"))
  9. library(TCGAmutations)
  10. library(maftools)
  11. laml = tcga_load(study = "KIRC")
  12. mut = laml@data
  13. save(exp,Group,exprSet,meta,mut,file = f)
  14. }
  15. load(f)
  16. 这里面的数据:
  17. exptumor-normal都有的表达矩阵,exprSet是只有tumor样本的表达矩阵。meta是临床信息表格,Grouptumor-normal分组信息。mut是突变信息,由maf文件读取并取子集得到。
  18. 1.比较任意基因在tumornormal样本中的表达量
  19. PBRM1为例画图,可替换为其他任意基因。
  20. table(Group)
  21. #> Group
  22. #> normal tumor
  23. #> 72 535
  24. library(ggstatsplot)
  25. exp = log2(edgeR::cpm(exp)+1)
  26. dat = data.frame(gene = exp["PBRM1",],
  27. group = Group)
  28. ggbetweenstats(data = dat, x = group, y = gene,title = "PBRM1")
  29. 2.任意基因在任意两个分组中的表达量对比
  30. 只要是可以根据临床信息查到或得到的分组,例如生死、人种、分期,都可以拿来做分组。
  31. 需要注意调整样本顺序,一一对应。
  32. #按照生死、人种、分期分组看看
  33. table(meta$event)
  34. #>
  35. #> 0 1
  36. #> 344 169
  37. table(meta$stage)
  38. #>
  39. #> I II III IV
  40. #> 255 56 117 82
  41. table(meta$race)
  42. #>
  43. #> asian black or african american white
  44. #> 8 51 447
  45. dat = data.frame(gene = exprSet["PBRM1",],
  46. event = meta$event,
  47. stage = meta$stage,
  48. race = meta$race)
  49. p1 = ggbetweenstats(data = dat, x = event, y = gene,title = "PBRM1")
  50. p2 = ggbetweenstats(data = dat, x = stage, y = gene,title = "PBRM1")
  51. p3 = ggbetweenstats(data = dat, x = race, y = gene,title = "PBRM1")
  52. library(patchwork)
  53. p1/p2/p3
  54. 3.根据某个基因是否突变分组比较某基因的表达量
  55. dim(exprSet)
  56. #> [1] 31798 513
  57. mut[1:4,1:4]
  58. #> Hugo_Symbol Chromosome Start_Position End_Position
  59. #> 1 NMT2 10 15154934 15154934
  60. #> 2 ARHGAP42 11 100845194 100845194
  61. #> 3 RAG1 11 36597064 36597064
  62. #> 4 NLRP14 11 7064166 7064166
  63. library(stringr)
  64. length(unique(str_sub(mut$Tumor_Sample_Barcode,1,16)))
  65. #> [1] 370
  66. k = colnames(exprSet) %in% unique(str_sub(mut$Tumor_Sample_Barcode,1,16));table(k)
  67. #> k
  68. #> FALSE TRUE
  69. #> 163 350
  70. #513个样本中,有350个有突变信息记录,将这些样本对应的表达矩阵取出来。
  71. expm = exprSet[,k]
  72. library(dplyr)
  73. VHL_mut = mut %>%
  74. filter(Hugo_Symbol=='VHL') %>%
  75. as.data.frame() %>%
  76. pull(Tumor_Sample_Barcode) %>%
  77. as.character() %>%
  78. str_sub(1,16)
  79. #false 是未突变样本,true是突变样本
  80. tail(rownames(expm))
  81. #> [1] "MT-ND5" "MT-ND6" "MT-TE" "MT-CYB" "MT-TT" "MT-TP"
  82. dat=data.frame(gene=expm['PBRM1',],
  83. mut= str_sub(colnames(expm),1,16) %in% VHL_mut)
  84. ggbetweenstats(data = dat, x = mut, y = gene)
  85. #可以计算每个基因的p值,找找是不是有显著的。
  86. res <- t.test(gene ~ as.factor(mut), data = dat)
  87. res$p.value
  88. #> [1] 0.1138761
  89. #自己试试能不能写出来这个循环?

任意基因的相关性分析

  1. corralation
  2. 0.输入数据和R
  3. rm(list=ls())
  4. library(ggpubr)
  5. library(stringr)
  6. load(file = "explore.Rdata")
  7. 15_boxplot的输入数据相同
  8. library(dplyr)
  9. VHL_mut = mut %>%
  10. filter(Hugo_Symbol=='VHL') %>%
  11. as.data.frame() %>%
  12. pull(Tumor_Sample_Barcode) %>%
  13. as.character() %>%
  14. str_sub(1,16)
  15. 2.任意两个基因的相关性分析
  16. 2.1 简单绘图
  17. 使用ggpbur
  18. dat=data.frame(gene1=exprSet['SETD2',],
  19. gene2=exprSet['PBRM1',],
  20. stage=meta$stage)
  21. sp1 <- ggscatter(dat, x = "gene1", y = "gene2",
  22. add = "reg.line", # Add regressin line
  23. add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
  24. conf.int = TRUE # Add confidence interval
  25. ) + stat_cor(method = "pearson")
  26. sp1
  27. 2.2 按照stage分组
  28. 不仅stage,任意在meta信息中能找到或生产的分组都可以。
  29. sp2 <- ggscatter( dat, x = "gene1", y = "gene2",
  30. color = "stage", palette = "jco",
  31. add = "reg.line", conf.int = TRUE) + stat_cor(aes(color = stage))
  32. sp2
  33. 2.3 按照是否突变来分组
  34. 理论上某个是否突变并不会改变某两个基因的相关性趋势,如果有这种特殊的突变,打乱了两个基因之间正常的相关关系,机制就有了。可以写循环试一下是否有这样的基因突变。
  35. expm = exprSet[,str_sub(colnames(exprSet),1,16) %in% unique(str_sub(mut$Tumor_Sample_Barcode,1,16))]
  36. dat=data.frame(gene1=expm['SETD2',],
  37. gene2=expm['PBRM1',],
  38. mut= str_sub(colnames(expm),1,16) %in% VHL_mut)
  39. sp3 <- ggscatter( dat, x = "gene1", y = "gene2",
  40. color = "mut", palette = "jco",
  41. add = "reg.line", conf.int = TRUE) + stat_cor(aes(color = mut))
  42. sp3
  43. 3.分面
  44. sp2 + facet_wrap(~stage, scales = "free_x")
  45. sp3 + facet_wrap(~mut, scales = "free_x")
  46. 分面的列有缺失值?导致了NA也占有一个子图。有两个选择,一个是换一列来分面,一个是作图数据中去除缺失值
  47. library(tidyr)
  48. dat=data.frame(gene1=exprSet['SETD2',],
  49. gene2=exprSet['PBRM1',],
  50. stage=meta$stage)
  51. dat = drop_na(dat,stage)
  52. ggscatter( dat, x = "gene1", y = "gene2",
  53. color = "stage", palette = "jco",
  54. add = "reg.line", conf.int = TRUE) + stat_cor(aes(color = stage))+
  55. facet_wrap(~stage, scales = "free_x")