随机森林模型
randomforest生信技能树1.准备输入数据输入数据是肿瘤样本表达矩阵exprSet、临床信息metaproj = "TCGA-KIRC"load(paste0(proj,"_sur_model.Rdata"))load(paste0(proj,"_logrank_cox_gene.Rdata"))g = cox[1:1000] #这里是示例,取了1000个单因素cox筛选的基因,活学活用exprSet = exprSet[g,]数据量太大需要运行很久,这里再缩小一点范围,用自己的数据时删掉这段代码exprSet = exprSet[1:1000,1:100]meta = meta[1:100,]library(randomForest)library(ROCR)library(genefilter)library(Hmisc)2.构建随机森林模型输入数据是表达矩阵(仅含tumor样本)和对应的生死。x=t(exprSet)y=meta$event#构建模型,一个叫randomForest的函数,运行时间很长,存Rdata跳过tmp_rf=paste0(proj,"_rf_output.Rdata")if(!file.exists(tmp_rf)){ rf_output=randomForest(x=x, y=y,importance = TRUE, #ntree = 10001, proximity=TRUE ) save(rf_output,file = tmp_rf)}load(file = tmp_rf)#top30的基因varImpPlot(rf_output, type=2, n.var=30, scale=FALSE, main="Variable Importance (Gini) for top 30 predictors",cex =.7)rf_importances=importance(rf_output, scale=FALSE)head(rf_importances)#> %IncMSE IncNodePurity#> WASH7P 1.081871e-04 4.732143e-03#> AL627309.6 -4.949768e-05 7.355461e-02#> WASH9P -1.935604e-04 2.077797e-02#> MTATP6P1 0.000000e+00 3.552714e-18#> LINC01409 9.696970e-05 6.891663e-03#> LINC00115 -5.882353e-05 9.679104e-03#解释量top30的基因,和图上是一样的,从下往上看。choose_gene=rownames(tail(rf_importances[order(rf_importances[,2]),],30))3.模型预测和评估rf.prob <- predict(rf_output, x)re=cbind(y ,rf.prob)head(re)#> y rf.prob#> TCGA-B8-5162-01A 0 0.2658333#> TCGA-B4-5834-01A 0 0.0769000#> TCGA-CW-6087-01A 1 0.8897333#> TCGA-B0-4698-01A 1 0.9392000#> TCGA-B0-4690-01A 1 0.9759000#> TCGA-AS-3778-01A 0 0.1081333ROC曲线library(ROCR)#library(caret)pred <- prediction(re[,2], re[,1])auc = performance(pred,"auc")@y.values[[1]]auc#> [1] 14.切割数据构建模型并预测4.1 切割数据用R包caret切割数据,生成的结果是一组代表列数的数字,用这些数字来给表达矩阵和meta取子集即可。library(caret)set.seed(12345679)sam<- createDataPartition(meta$event, p = .5,list = FALSE)train <- exprSet[,sam]test <- exprSet[,-sam]train_meta <- meta[sam,]test_meta <- meta[-sam,]4.2 切割后的train数据集建模和上面的建模方法一样。#建模x = t(train)y = train_meta$eventtmp_rf=paste0(proj,"_t_rf_output.Rdata")if(!file.exists(tmp_rf)){ rf_output=randomForest(x=x, y=y,importance = TRUE, #ntree = 10001, proximity=TRUE ) save(rf_output,file = tmp_rf)}load(file = tmp_rf)choose_gene=rownames(tail(rf_importances[order(rf_importances[,2]),],30))head(choose_gene)#> [1] "PUSL1" "CFAP45" "ISG15" "CGN" "IBA57" #> [6] "AC104794.5"5.模型预测用训练集构建模型,预测测试集的生死。x = t(test)y = test_meta$eventrf.prob <- predict(rf_output, x)re=cbind(y ,rf.prob)re=as.data.frame(re)colnames(re)=c('event','prob')re$event=as.factor(re$event)再看AUC值。library(ROCR)# 训练集模型预测测试集pred <- prediction(re[,2], re[,1])auc= performance(pred,"auc")@y.values[[1]]auc#> [1] 0.8181818还行。
支持向量机模型
svm生信技能树1.准备输入数据proj = "TCGA-KIRC"load(paste0(proj,"_sur_model.Rdata"))library(ROCR)library(genefilter)library(Hmisc)#> Warning: package 'Hmisc' was built under R version 4.1.3#> Warning: package 'survival' was built under R version 4.1.3library(e1071)2.构建支持向量机模型2.1.切割数据用R包caret切割数据,生成的结果是一组代表列数的数字,用这些数字来给表达矩阵和meta取子集即可。library(caret)#> Warning: package 'caret' was built under R version 4.1.3set.seed(12345679)sam<- createDataPartition(meta$event, p = .5,list = FALSE)train <- exprSet[,sam]test <- exprSet[,-sam]train_meta <- meta[sam,]test_meta <- meta[-sam,]2.2 train数据集建模x=t(train)y=as.factor(train_meta$event)model = svm(x,y,kernel = "linear")summary(model) #> #> Call:#> svm.default(x = x, y = y, kernel = "linear")#> #> #> Parameters:#> SVM-Type: C-classification #> SVM-Kernel: linear #> cost: 1 #> #> Number of Support Vectors: 221#> #> ( 75 146 )#> #> #> Number of Classes: 2 #> #> Levels: #> 0 12.3.模型预测用训练集构建模型,预测测试集的生死。不同于其他模型,这个预测结果是分类变量,直接预测生死,而不是probx=t(test)y=as.factor(test_meta$event)pred = predict(model, x)table(pred,y)#> y#> pred 0 1#> 0 153 44#> 1 17 42
time-ROC曲线
timeROC生信技能树1.构建模型并完成预测load("TCGA-KIRC_sur_model.Rdata")load("TCGA-KIRC_logrank_cox_gene.Rdata")g = cox[1:1000] #这里是示例,取了1000个单因素cox筛选的基因,活学活用exprSet = exprSet[g,]x=t(exprSet)y=meta$eventlibrary(glmnet)#> Warning: package 'glmnet' was built under R version 4.1.3#> Warning: package 'Matrix' was built under R version 4.1.3cv_fit <- cv.glmnet(x=x, y=y)model_lasso_min <- glmnet(x=x, y=y, lambda=cv_fit$lambda.min)choose_gene_min=rownames(model_lasso_min$beta)[as.numeric(model_lasso_min$beta)!=0]lasso.prob <- predict(cv_fit, newx=x , s=cv_fit$lambda.min )head(lasso.prob)#> s1#> TCGA-B8-5162-01A 0.35258903#> TCGA-B4-5834-01A 0.06925658#> TCGA-CW-6087-01A 0.70950202#> TCGA-B0-4698-01A 0.61516034#> TCGA-B0-4690-01A 0.82493174#> TCGA-AS-3778-01A -0.11996394dat = cbind(meta[,1:3], fp = lasso.prob[,1])head(dat)#> ID event time fp#> TCGA-B8-5162-01A TCGA-B8-5162-01A 0 1.200000 0.35258903#> TCGA-B4-5834-01A TCGA-B4-5834-01A 0 1.266667 0.06925658#> TCGA-CW-6087-01A TCGA-CW-6087-01A 1 1.366667 0.70950202#> TCGA-B0-4698-01A TCGA-B0-4698-01A 1 1.400000 0.61516034#> TCGA-B0-4690-01A TCGA-B0-4690-01A 1 1.433333 0.82493174#> TCGA-AS-3778-01A TCGA-AS-3778-01A 0 1.433333 -0.119963942 time-ROC计算和出图library(survminer)library(survival)#> Warning: package 'survival' was built under R version 4.1.3library(timeROC)result <-with(dat, timeROC(T=time, delta=event, marker=fp, cause=1, times=c(12,36,96), iid = TRUE))plot_dat = data.frame(fpr = as.numeric(result$FP), tpr = as.numeric(result$TP), time = rep(as.factor(c(12,36,96)),each = nrow(result$TP)))library(ggplot2)ggplot() + geom_line(data = plot_dat,aes(x = fpr, y = tpr,color = time),size = 1) + scale_color_manual(name = NULL,values = c("#2874C5", "#f87669", "#e6b707"), labels = paste0("AUC of ",c(1,3,5),"-y survival: ", format(round(result$AUC,2),nsmall = 2)))+ geom_line(aes(x=c(0,1),y=c(0,1)),color = "grey")+ theme_bw()+ theme(panel.grid = element_blank(), legend.background = element_rect(linetype = 1, size = 0.2, colour = "black"), legend.position = c(0.765,0.125))+ scale_x_continuous(expand = c(0.005,0.005))+ scale_y_continuous(expand = c(0.005,0.005))+ labs(x = "1 - Specificity", y = "Sensitivity")+ coord_fixed()
风险因子三图联动
三图联动生信技能树1.构建cox模型并预测proj = "TCGA-KIRC"load(paste0(proj,"_sur_model.Rdata"))load(paste0(proj,"_lasso_choose_gene_1se.Rdata"))library(survival)library(survminer)library(ggplotify)library(cowplot)library(Hmisc)options(scipen=200)library(pheatmap)library(stringr)e=t(exprSet[choose_gene_1se,])colnames(e)= str_replace_all(colnames(e),"-","_")dat=cbind(meta,e)dat$gender=as.numeric(factor(dat$gender))dat$stage=as.numeric(factor(dat$stage))colnames(dat)#> [1] "ID" "event" "time" "race" "age" #> [6] "gender" "stage" "AL391244.2" "AJAP1" "AL357140.2"#> [11] "CPLANE2" "RCC2" "AL137127.1" "NIPAL3" "KDF1" #> [16] "LINC01389" "ANGPTL3" "GADD45A" "LINC01725" "LINC02609" #> [21] "CHI3L2" "LINC01719" "KCNN3" "MSTO1" "PMF1_BGLAP"#> [26] "CRABP2" "CD1E" "FCER1A" "NUF2" "DUTP6" #> [31] "IGFN1" "NEK2" "PROX1" "DUSP5P1" "HS1BP3_IT1"#> [36] "OTOF" "CENPA" "SLC5A6" "PRKCE" "SPTBN1"library(survival)library(survminer)dat2 = na.omit(dat)model = coxph(formula = Surv(time, event) ~ stage + AL357140.2 + AL391244.2 + age + OTOF + IGFN1 + CRABP2 + PMF1_BGLAP + SLC5A6 + PROX1 + LINC01719 + DUTP6, data = dat2)ggforest(model, data =dat2)fp <- predict(model,dat2)head(fp)#> TCGA-B8-5162-01A TCGA-B4-5834-01A TCGA-CW-6087-01A TCGA-B0-4698-01A #> -0.2267798 -1.2751770 2.0868264 2.6742892 #> TCGA-B0-4690-01A TCGA-AS-3778-01A #> 2.4891564 -2.31987742.划分高低风险并画生存分析图names(fp) = dat2$IDri = ifelse(fp<median(fp),"lowrisk","highrisk")ri = factor(ri,levels = c("lowrisk","highrisk"))sfit <- survfit(Surv(time, event)~ri, data=dat2)ggsurvplot(sfit,palette = c("#E7B800", "#2E9FDF"), risk.table =TRUE,pval =TRUE, conf.int =TRUE,xlab ="Time in months", ggtheme =theme_light())3.风险因子三图联动fp_dat=data.frame(patientid=1:length(fp), fp=as.numeric(sort(fp)), ri= ri[order(fp)])sur_dat=data.frame(patientid=1:length(fp), time=dat2[order(fp),'time'] , event=dat2[order(fp),'event']) sur_dat$event=ifelse(sur_dat$event==0,'alive','death')sur_dat$event=factor(sur_dat$event,levels = c("death","alive"))library(stringr)gs = str_split("stage + AL357140.2 + AL391244.2 + age + OTOF + IGFN1 + CRABP2 + PMF1_BGLAP + SLC5A6 + PROX1 + LINC01719 + DUTP6"," \\+ ")[[1]]gs = gs[gs %in% rownames(exprSet)]exp_dat = t(dat2[order(fp),gs])colnames(exp_dat) = str_sub(colnames(exp_dat),1,12)rownames(exp_dat) = str_replace_all(rownames(exp_dat),"_","-")###第一个图----p1=ggplot(fp_dat,aes(x=patientid,y=fp))+ geom_point(aes(color = ri))+ scale_color_manual(values = c("#2874C5", "#f87669"))+ geom_vline(xintercept = 0.5*nrow(sur_dat),lty = 2)+ scale_x_continuous(expand=c(0.01,0))+ labs(x = "",y = "risk score")+ theme_bw()#第二个图p2=ggplot(sur_dat,aes(x=patientid,y=time))+ geom_point(aes(col=event))+ scale_color_manual(values = c("#f87669","#2874C5"))+ geom_vline(xintercept = 0.5*nrow(sur_dat),lty = 2)+ scale_x_continuous(expand=c(0.01,0))+ labs(x = "")+ theme_bw()#第三个图annotation_col = data.frame(group = ri, row.names = names(ri))mycolors <- colorRampPalette(c("#2874C5","white", "#f87669"), bias = 1.2)(100)ann_colors = list( group = c(lowrisk="#2874C5", highrisk="#f87669"))p3=pheatmap::pheatmap(exp_dat, col= mycolors, annotation_col = annotation_col, annotation_colors =ann_colors, scale = "row", breaks = seq(-3,3,length.out = 100), show_colnames = F, cluster_cols = F, annotation_legend = F)#拼图实现三图联动library(patchwork)p1/p2/as.ggplot(p3)# 暂时的补救办法library(tinyarray)n = scale(t(exp_dat))n[n > 3] = 3n[n < -3] = -3p3 = ggheat(n,fp_dat$ri,show_rownames = F)+ theme(axis.text = element_text(size = 8))p3p1 /p2 /p3 + plot_layout(design = 'A B C C C C')ggsave("risk_plot.png",height = 10,width = 10)从上向下三个图分别表示:1.每个病人的预测值,按照从小到大排序 2.每个病人的生存时间,颜色区分生死 3.热图,挑出的基因在每个样本中的表达量
maf文件下载和整理
maftools1.数据获取使用github上的R包TCGAmutations来获取数据library(TCGAmutations)library(maftools) library(dplyr)proj='TCGA-KIRC'laml = tcga_load(study = "KIRC") laml #> An object of class MAF #> ID summary Mean Median#> 1: NCBI_Build NA NA NA#> 2: Center NA NA NA#> 3: Samples 370 NA NA#> 4: nGenes 10243 NA NA#> 5: Frame_Shift_Del 1792 4.843 4#> 6: Frame_Shift_Ins 463 1.251 1#> 7: In_Frame_Del 270 0.730 0#> 8: In_Frame_Ins 26 0.070 0#> 9: Missense_Mutation 17050 46.081 42#> 10: Nonsense_Mutation 1297 3.505 3#> 11: Nonstop_Mutation 33 0.089 0#> 12: Splice_Site 672 1.816 1#> 13: Translation_Start_Site 34 0.092 0#> 14: total 21637 58.478 53length(unique(laml@data$Tumor_Sample_Barcode))#> [1] 370length(unique(laml@data$Hugo_Symbol))#> [1] 10243有370个病人,10243个突变基因的信息。3.突变数据的可视化3.1 plotmafSummarymaftools 自带可视化函数plotmafSummary,可以比较直观的统计maf文件的数据。#if (as.numeric(dev.cur()) != 1) graphics.off()plotmafSummary(maf = laml, rmOutlier = TRUE, #showBarcodes = FALSE, addStat = 'median', dashboard = TRUE, titvRaw = FALSE)就是将laml@data数据框做了统计,用barplot和boxplot做了可视化。3.2 突变频谱图可以选择突变数量前30的基因,也可以指定感兴趣的几个基因oncoplot(maf = laml, top = 30, fontSize = 0.7)g = c( "RBM43", "VHL", "SETD2", "PBRM1")oncoplot(maf = laml,genes = g, fontSize = 0.7)3.2 可以给oncoplot加上一些临床信息pd 是临床信息数据框,第一列是Tumor_Sample_Barcode,后面几列是各种分组,如gender,age,stage等等。colnames(laml@clinical.data)[57] = "stage"oncoplot(maf = laml, sortByAnnotation = TRUE, clinicalFeatures = c("stage", 'gender') )自定义颜色col = RColorBrewer::brewer.pal(n = 8,name = 'Set3')stagecolors = col[1:4]names(stagecolors) = na.omit(unique(laml@clinical.data$stage))gendercolors = col[5:6]names(gendercolors) = unique(laml@clinical.data$gender)ancolors = list(stage = stagecolors, gender = gendercolors)oncoplot(maf = laml, sortByAnnotation = TRUE, clinicalFeatures = c("stage", 'gender'), annotationColor = ancolors, )
突变signature
signafiture0.R包rm(list=ls())proj='TCGA-KIRC'library(maftools) library(dplyr)library(pheatmap)library(ComplexHeatmap)library(circlize)library(stringr)library(deconstructSigs)library(BSgenome)library(BSgenome.Hsapiens.UCSC.hg19)1.读取突变数据maf是和上一篇一样的数据,从gdc下载的maf文件和临床信息library(TCGAmutations)library(maftools) library(dplyr)proj='TCGA-KIRC'laml = tcga_load(study = "KIRC") laml #> An object of class MAF #> ID summary Mean Median#> 1: NCBI_Build NA NA NA#> 2: Center NA NA NA#> 3: Samples 370 NA NA#> 4: nGenes 10243 NA NA#> 5: Frame_Shift_Del 1792 4.843 4#> 6: Frame_Shift_Ins 463 1.251 1#> 7: In_Frame_Del 270 0.730 0#> 8: In_Frame_Ins 26 0.070 0#> 9: Missense_Mutation 17050 46.081 42#> 10: Nonsense_Mutation 1297 3.505 3#> 11: Nonstop_Mutation 33 0.089 0#> 12: Splice_Site 672 1.816 1#> 13: Translation_Start_Site 34 0.092 0#> 14: total 21637 58.478 53mut=laml@datamut[1:4,1:2]#> Hugo_Symbol Chromosome#> 1: NMT2 10#> 2: ARHGAP42 11#> 3: RAG1 11#> 4: NLRP14 11mut=mut[mut$Variant_Type=='SNP',]#每行记录了一个突变,所以统计样本列的行数即为样本的突变数量plot(table(mut[,16]),las = 2)2.制作signatures的输入数据(96频谱)关于什么是mutationsignature:http://www.bio-info-trainee.com/1619.html关于96频谱:http://www.biotrainee.com/thread-832-1-1.htmla = mut[,c(16,2,3,5,6)]colnames(a)=c( "Sample","chr", "pos","ref", "alt")a$Sample=as.character(a$Sample)a$chr = paste0("chr",a$chr)sigs.input <- mut.to.sigs.input(mut.ref = a, sample.id = "Sample", chr = "chr", pos = "pos", ref = "ref", alt = "alt", bsg = BSgenome.Hsapiens.UCSC.hg19)class(sigs.input)#> [1] "data.frame"#第一个样本的突变绘图barplot(as.numeric(sigs.input[1,])~colnames(sigs.input))sigs.input[1:4,1:4]#> A[C>A]A A[C>A]C A[C>A]G A[C>A]T#> TCGA-3Z-A93Z-01A-11D-A36X-10 0 0 0 0#> TCGA-6D-AA2E-01A-11D-A36X-10 0 0 0 0#> TCGA-A3-3308-01A-01D-0966-08 0 0 0 0#> TCGA-A3-3311-01A-01D-0966-08 0 0 0 0一顿操作猛如虎,生成signature与样本对应关系的矩阵if(!file.exists(paste0(proj,"_w.Rdata"))){ w=lapply(unique(a$Sample), function(i){ ## signatures.cosmic signatures.nature2013 sample_1 = whichSignatures(tumor.ref = sigs.input, signatures.ref = signatures.cosmic, sample.id = i, contexts.needed = TRUE, tri.counts.method = 'default') print(c(i,which(unique(a$Sample)==i))) return(sample_1$weights) }) w=do.call(rbind,w) save(w,file = paste0(proj,"_w.Rdata"))}load(paste0(proj,"_w.Rdata"))mat = t(w)3.可视化矩阵的可视化–热图pheatmap::pheatmap(mat, cluster_rows = F, cluster_cols = T, show_colnames = F)
任意基因的任意分组比较
任意基因的任意分组比较0.输入数据rm(list=ls())proj = "TCGA-KIRC"f = "explore.Rdata"if(!file.exists(f)){ load(paste0(proj,".Rdata")) load(paste0(proj,"_sur_model.Rdata")) library(TCGAmutations) library(maftools) laml = tcga_load(study = "KIRC") mut = laml@data save(exp,Group,exprSet,meta,mut,file = f)}load(f)这里面的数据:exp是tumor-normal都有的表达矩阵,exprSet是只有tumor样本的表达矩阵。meta是临床信息表格,Group是tumor-normal分组信息。mut是突变信息,由maf文件读取并取子集得到。1.比较任意基因在tumor和normal样本中的表达量以PBRM1为例画图,可替换为其他任意基因。table(Group)#> Group#> normal tumor #> 72 535library(ggstatsplot)exp = log2(edgeR::cpm(exp)+1)dat = data.frame(gene = exp["PBRM1",], group = Group)ggbetweenstats(data = dat, x = group, y = gene,title = "PBRM1")2.任意基因在任意两个分组中的表达量对比只要是可以根据临床信息查到或得到的分组,例如生死、人种、分期,都可以拿来做分组。需要注意调整样本顺序,一一对应。#按照生死、人种、分期分组看看table(meta$event)#> #> 0 1 #> 344 169table(meta$stage)#> #> I II III IV #> 255 56 117 82table(meta$race)#> #> asian black or african american white #> 8 51 447dat = data.frame(gene = exprSet["PBRM1",], event = meta$event, stage = meta$stage, race = meta$race)p1 = ggbetweenstats(data = dat, x = event, y = gene,title = "PBRM1")p2 = ggbetweenstats(data = dat, x = stage, y = gene,title = "PBRM1")p3 = ggbetweenstats(data = dat, x = race, y = gene,title = "PBRM1")library(patchwork)p1/p2/p33.根据某个基因是否突变分组比较某基因的表达量dim(exprSet)#> [1] 31798 513mut[1:4,1:4]#> Hugo_Symbol Chromosome Start_Position End_Position#> 1 NMT2 10 15154934 15154934#> 2 ARHGAP42 11 100845194 100845194#> 3 RAG1 11 36597064 36597064#> 4 NLRP14 11 7064166 7064166library(stringr)length(unique(str_sub(mut$Tumor_Sample_Barcode,1,16)))#> [1] 370k = colnames(exprSet) %in% unique(str_sub(mut$Tumor_Sample_Barcode,1,16));table(k)#> k#> FALSE TRUE #> 163 350#513个样本中,有350个有突变信息记录,将这些样本对应的表达矩阵取出来。expm = exprSet[,k]library(dplyr)VHL_mut = mut %>% filter(Hugo_Symbol=='VHL') %>% as.data.frame() %>% pull(Tumor_Sample_Barcode) %>% as.character() %>% str_sub(1,16)#false 是未突变样本,true是突变样本tail(rownames(expm))#> [1] "MT-ND5" "MT-ND6" "MT-TE" "MT-CYB" "MT-TT" "MT-TP"dat=data.frame(gene=expm['PBRM1',], mut= str_sub(colnames(expm),1,16) %in% VHL_mut)ggbetweenstats(data = dat, x = mut, y = gene)#可以计算每个基因的p值,找找是不是有显著的。res <- t.test(gene ~ as.factor(mut), data = dat)res$p.value#> [1] 0.1138761#自己试试能不能写出来这个循环?
任意基因的相关性分析
corralation0.输入数据和R包rm(list=ls())library(ggpubr)library(stringr)load(file = "explore.Rdata")与15_boxplot的输入数据相同library(dplyr)VHL_mut = mut %>% filter(Hugo_Symbol=='VHL') %>% as.data.frame() %>% pull(Tumor_Sample_Barcode) %>% as.character() %>% str_sub(1,16)2.任意两个基因的相关性分析2.1 简单绘图使用ggpbur。dat=data.frame(gene1=exprSet['SETD2',], gene2=exprSet['PBRM1',], stage=meta$stage)sp1 <- ggscatter(dat, x = "gene1", y = "gene2", add = "reg.line", # Add regressin line add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line conf.int = TRUE # Add confidence interval) + stat_cor(method = "pearson")sp12.2 按照stage分组不仅stage,任意在meta信息中能找到或生产的分组都可以。sp2 <- ggscatter( dat, x = "gene1", y = "gene2", color = "stage", palette = "jco", add = "reg.line", conf.int = TRUE) + stat_cor(aes(color = stage))sp22.3 按照是否突变来分组理论上某个是否突变并不会改变某两个基因的相关性趋势,如果有这种特殊的突变,打乱了两个基因之间正常的相关关系,机制就有了。可以写循环试一下是否有这样的基因突变。expm = exprSet[,str_sub(colnames(exprSet),1,16) %in% unique(str_sub(mut$Tumor_Sample_Barcode,1,16))]dat=data.frame(gene1=expm['SETD2',], gene2=expm['PBRM1',], mut= str_sub(colnames(expm),1,16) %in% VHL_mut)sp3 <- ggscatter( dat, x = "gene1", y = "gene2", color = "mut", palette = "jco", add = "reg.line", conf.int = TRUE) + stat_cor(aes(color = mut))sp33.分面sp2 + facet_wrap(~stage, scales = "free_x")sp3 + facet_wrap(~mut, scales = "free_x")分面的列有缺失值?导致了NA也占有一个子图。有两个选择,一个是换一列来分面,一个是作图数据中去除缺失值library(tidyr)dat=data.frame(gene1=exprSet['SETD2',], gene2=exprSet['PBRM1',], stage=meta$stage)dat = drop_na(dat,stage)ggscatter( dat, x = "gene1", y = "gene2", color = "stage", palette = "jco", add = "reg.line", conf.int = TRUE) + stat_cor(aes(color = stage))+ facet_wrap(~stage, scales = "free_x")