随机森林模型
randomforest
生信技能树
1.准备输入数据
输入数据是肿瘤样本表达矩阵exprSet、临床信息meta
proj = "TCGA-KIRC"
load(paste0(proj,"_sur_model.Rdata"))
load(paste0(proj,"_logrank_cox_gene.Rdata"))
g = cox[1:1000] #这里是示例,取了1000个单因素cox筛选的基因,活学活用
exprSet = exprSet[g,]
数据量太大需要运行很久,这里再缩小一点范围,用自己的数据时删掉这段代码
exprSet = exprSet[1:1000,1:100]
meta = meta[1:100,]
library(randomForest)
library(ROCR)
library(genefilter)
library(Hmisc)
2.构建随机森林模型
输入数据是表达矩阵(仅含tumor样本)和对应的生死。
x=t(exprSet)
y=meta$event
#构建模型,一个叫randomForest的函数,运行时间很长,存Rdata跳过
tmp_rf=paste0(proj,"_rf_output.Rdata")
if(!file.exists(tmp_rf)){
rf_output=randomForest(x=x, y=y,importance = TRUE, #ntree = 10001,
proximity=TRUE )
save(rf_output,file = tmp_rf)
}
load(file = tmp_rf)
#top30的基因
varImpPlot(rf_output, type=2, n.var=30, scale=FALSE,
main="Variable Importance (Gini) for top 30 predictors",cex =.7)
rf_importances=importance(rf_output, scale=FALSE)
head(rf_importances)
#> %IncMSE IncNodePurity
#> WASH7P 1.081871e-04 4.732143e-03
#> AL627309.6 -4.949768e-05 7.355461e-02
#> WASH9P -1.935604e-04 2.077797e-02
#> MTATP6P1 0.000000e+00 3.552714e-18
#> LINC01409 9.696970e-05 6.891663e-03
#> LINC00115 -5.882353e-05 9.679104e-03
#解释量top30的基因,和图上是一样的,从下往上看。
choose_gene=rownames(tail(rf_importances[order(rf_importances[,2]),],30))
3.模型预测和评估
rf.prob <- predict(rf_output, x)
re=cbind(y ,rf.prob)
head(re)
#> y rf.prob
#> TCGA-B8-5162-01A 0 0.2658333
#> TCGA-B4-5834-01A 0 0.0769000
#> TCGA-CW-6087-01A 1 0.8897333
#> TCGA-B0-4698-01A 1 0.9392000
#> TCGA-B0-4690-01A 1 0.9759000
#> TCGA-AS-3778-01A 0 0.1081333
ROC曲线
library(ROCR)
#library(caret)
pred <- prediction(re[,2], re[,1])
auc = performance(pred,"auc")@y.values[[1]]
auc
#> [1] 1
4.切割数据构建模型并预测
4.1 切割数据
用R包caret切割数据,生成的结果是一组代表列数的数字,用这些数字来给表达矩阵和meta取子集即可。
library(caret)
set.seed(12345679)
sam<- createDataPartition(meta$event, p = .5,list = FALSE)
train <- exprSet[,sam]
test <- exprSet[,-sam]
train_meta <- meta[sam,]
test_meta <- meta[-sam,]
4.2 切割后的train数据集建模
和上面的建模方法一样。
#建模
x = t(train)
y = train_meta$event
tmp_rf=paste0(proj,"_t_rf_output.Rdata")
if(!file.exists(tmp_rf)){
rf_output=randomForest(x=x, y=y,importance = TRUE, #ntree = 10001,
proximity=TRUE )
save(rf_output,file = tmp_rf)
}
load(file = tmp_rf)
choose_gene=rownames(tail(rf_importances[order(rf_importances[,2]),],30))
head(choose_gene)
#> [1] "PUSL1" "CFAP45" "ISG15" "CGN" "IBA57"
#> [6] "AC104794.5"
5.模型预测
用训练集构建模型,预测测试集的生死。
x = t(test)
y = test_meta$event
rf.prob <- predict(rf_output, x)
re=cbind(y ,rf.prob)
re=as.data.frame(re)
colnames(re)=c('event','prob')
re$event=as.factor(re$event)
再看AUC值。
library(ROCR)
# 训练集模型预测测试集
pred <- prediction(re[,2], re[,1])
auc= performance(pred,"auc")@y.values[[1]]
auc
#> [1] 0.8181818
还行。
支持向量机模型
svm
生信技能树
1.准备输入数据
proj = "TCGA-KIRC"
load(paste0(proj,"_sur_model.Rdata"))
library(ROCR)
library(genefilter)
library(Hmisc)
#> Warning: package 'Hmisc' was built under R version 4.1.3
#> Warning: package 'survival' was built under R version 4.1.3
library(e1071)
2.构建支持向量机模型
2.1.切割数据
用R包caret切割数据,生成的结果是一组代表列数的数字,用这些数字来给表达矩阵和meta取子集即可。
library(caret)
#> Warning: package 'caret' was built under R version 4.1.3
set.seed(12345679)
sam<- createDataPartition(meta$event, p = .5,list = FALSE)
train <- exprSet[,sam]
test <- exprSet[,-sam]
train_meta <- meta[sam,]
test_meta <- meta[-sam,]
2.2 train数据集建模
x=t(train)
y=as.factor(train_meta$event)
model = svm(x,y,kernel = "linear")
summary(model)
#>
#> Call:
#> svm.default(x = x, y = y, kernel = "linear")
#>
#>
#> Parameters:
#> SVM-Type: C-classification
#> SVM-Kernel: linear
#> cost: 1
#>
#> Number of Support Vectors: 221
#>
#> ( 75 146 )
#>
#>
#> Number of Classes: 2
#>
#> Levels:
#> 0 1
2.3.模型预测
用训练集构建模型,预测测试集的生死。不同于其他模型,这个预测结果是分类变量,直接预测生死,而不是prob
x=t(test)
y=as.factor(test_meta$event)
pred = predict(model, x)
table(pred,y)
#> y
#> pred 0 1
#> 0 153 44
#> 1 17 42
time-ROC曲线
timeROC
生信技能树
1.构建模型并完成预测
load("TCGA-KIRC_sur_model.Rdata")
load("TCGA-KIRC_logrank_cox_gene.Rdata")
g = cox[1:1000] #这里是示例,取了1000个单因素cox筛选的基因,活学活用
exprSet = exprSet[g,]
x=t(exprSet)
y=meta$event
library(glmnet)
#> Warning: package 'glmnet' was built under R version 4.1.3
#> Warning: package 'Matrix' was built under R version 4.1.3
cv_fit <- cv.glmnet(x=x, y=y)
model_lasso_min <- glmnet(x=x, y=y, lambda=cv_fit$lambda.min)
choose_gene_min=rownames(model_lasso_min$beta)[as.numeric(model_lasso_min$beta)!=0]
lasso.prob <- predict(cv_fit, newx=x , s=cv_fit$lambda.min )
head(lasso.prob)
#> s1
#> TCGA-B8-5162-01A 0.35258903
#> TCGA-B4-5834-01A 0.06925658
#> TCGA-CW-6087-01A 0.70950202
#> TCGA-B0-4698-01A 0.61516034
#> TCGA-B0-4690-01A 0.82493174
#> TCGA-AS-3778-01A -0.11996394
dat = cbind(meta[,1:3],
fp = lasso.prob[,1])
head(dat)
#> ID event time fp
#> TCGA-B8-5162-01A TCGA-B8-5162-01A 0 1.200000 0.35258903
#> TCGA-B4-5834-01A TCGA-B4-5834-01A 0 1.266667 0.06925658
#> TCGA-CW-6087-01A TCGA-CW-6087-01A 1 1.366667 0.70950202
#> TCGA-B0-4698-01A TCGA-B0-4698-01A 1 1.400000 0.61516034
#> TCGA-B0-4690-01A TCGA-B0-4690-01A 1 1.433333 0.82493174
#> TCGA-AS-3778-01A TCGA-AS-3778-01A 0 1.433333 -0.11996394
2 time-ROC计算和出图
library(survminer)
library(survival)
#> Warning: package 'survival' was built under R version 4.1.3
library(timeROC)
result <-with(dat, timeROC(T=time,
delta=event,
marker=fp,
cause=1,
times=c(12,36,96),
iid = TRUE))
plot_dat = data.frame(fpr = as.numeric(result$FP),
tpr = as.numeric(result$TP),
time = rep(as.factor(c(12,36,96)),each = nrow(result$TP)))
library(ggplot2)
ggplot() +
geom_line(data = plot_dat,aes(x = fpr, y = tpr,color = time),size = 1) +
scale_color_manual(name = NULL,values = c("#2874C5", "#f87669", "#e6b707"),
labels = paste0("AUC of ",c(1,3,5),"-y survival: ",
format(round(result$AUC,2),nsmall = 2)))+
geom_line(aes(x=c(0,1),y=c(0,1)),color = "grey")+
theme_bw()+
theme(panel.grid = element_blank(),
legend.background = element_rect(linetype = 1, size = 0.2, colour = "black"),
legend.position = c(0.765,0.125))+
scale_x_continuous(expand = c(0.005,0.005))+
scale_y_continuous(expand = c(0.005,0.005))+
labs(x = "1 - Specificity",
y = "Sensitivity")+
coord_fixed()
风险因子三图联动
三图联动
生信技能树
1.构建cox模型并预测
proj = "TCGA-KIRC"
load(paste0(proj,"_sur_model.Rdata"))
load(paste0(proj,"_lasso_choose_gene_1se.Rdata"))
library(survival)
library(survminer)
library(ggplotify)
library(cowplot)
library(Hmisc)
options(scipen=200)
library(pheatmap)
library(stringr)
e=t(exprSet[choose_gene_1se,])
colnames(e)= str_replace_all(colnames(e),"-","_")
dat=cbind(meta,e)
dat$gender=as.numeric(factor(dat$gender))
dat$stage=as.numeric(factor(dat$stage))
colnames(dat)
#> [1] "ID" "event" "time" "race" "age"
#> [6] "gender" "stage" "AL391244.2" "AJAP1" "AL357140.2"
#> [11] "CPLANE2" "RCC2" "AL137127.1" "NIPAL3" "KDF1"
#> [16] "LINC01389" "ANGPTL3" "GADD45A" "LINC01725" "LINC02609"
#> [21] "CHI3L2" "LINC01719" "KCNN3" "MSTO1" "PMF1_BGLAP"
#> [26] "CRABP2" "CD1E" "FCER1A" "NUF2" "DUTP6"
#> [31] "IGFN1" "NEK2" "PROX1" "DUSP5P1" "HS1BP3_IT1"
#> [36] "OTOF" "CENPA" "SLC5A6" "PRKCE" "SPTBN1"
library(survival)
library(survminer)
dat2 = na.omit(dat)
model = coxph(formula = Surv(time, event) ~ stage + AL357140.2 + AL391244.2 +
age + OTOF + IGFN1 + CRABP2 + PMF1_BGLAP + SLC5A6 + PROX1 +
LINC01719 + DUTP6, data = dat2)
ggforest(model, data =dat2)
fp <- predict(model,dat2)
head(fp)
#> TCGA-B8-5162-01A TCGA-B4-5834-01A TCGA-CW-6087-01A TCGA-B0-4698-01A
#> -0.2267798 -1.2751770 2.0868264 2.6742892
#> TCGA-B0-4690-01A TCGA-AS-3778-01A
#> 2.4891564 -2.3198774
2.划分高低风险并画生存分析图
names(fp) = dat2$ID
ri = ifelse(fp<median(fp),"lowrisk","highrisk")
ri = factor(ri,levels = c("lowrisk","highrisk"))
sfit <- survfit(Surv(time, event)~ri, data=dat2)
ggsurvplot(sfit,palette = c("#E7B800", "#2E9FDF"),
risk.table =TRUE,pval =TRUE,
conf.int =TRUE,xlab ="Time in months",
ggtheme =theme_light())
3.风险因子三图联动
fp_dat=data.frame(patientid=1:length(fp),
fp=as.numeric(sort(fp)),
ri= ri[order(fp)])
sur_dat=data.frame(patientid=1:length(fp),
time=dat2[order(fp),'time'] ,
event=dat2[order(fp),'event'])
sur_dat$event=ifelse(sur_dat$event==0,'alive','death')
sur_dat$event=factor(sur_dat$event,levels = c("death","alive"))
library(stringr)
gs = str_split("stage + AL357140.2 + AL391244.2 + age + OTOF + IGFN1 + CRABP2 + PMF1_BGLAP + SLC5A6 + PROX1 + LINC01719 + DUTP6"," \\+ ")[[1]]
gs = gs[gs %in% rownames(exprSet)]
exp_dat = t(dat2[order(fp),gs])
colnames(exp_dat) = str_sub(colnames(exp_dat),1,12)
rownames(exp_dat) = str_replace_all(rownames(exp_dat),"_","-")
###第一个图----
p1=ggplot(fp_dat,aes(x=patientid,y=fp))+
geom_point(aes(color = ri))+
scale_color_manual(values = c("#2874C5", "#f87669"))+
geom_vline(xintercept = 0.5*nrow(sur_dat),lty = 2)+
scale_x_continuous(expand=c(0.01,0))+
labs(x = "",y = "risk score")+
theme_bw()
#第二个图
p2=ggplot(sur_dat,aes(x=patientid,y=time))+
geom_point(aes(col=event))+
scale_color_manual(values = c("#f87669","#2874C5"))+
geom_vline(xintercept = 0.5*nrow(sur_dat),lty = 2)+
scale_x_continuous(expand=c(0.01,0))+
labs(x = "")+
theme_bw()
#第三个图
annotation_col = data.frame(group = ri,
row.names = names(ri))
mycolors <- colorRampPalette(c("#2874C5","white", "#f87669"), bias = 1.2)(100)
ann_colors = list(
group = c(lowrisk="#2874C5", highrisk="#f87669")
)
p3=pheatmap::pheatmap(exp_dat,
col= mycolors,
annotation_col = annotation_col,
annotation_colors =ann_colors,
scale = "row",
breaks = seq(-3,3,length.out = 100),
show_colnames = F,
cluster_cols = F,
annotation_legend = F)
#拼图实现三图联动
library(patchwork)
p1/p2/as.ggplot(p3)
# 暂时的补救办法
library(tinyarray)
n = scale(t(exp_dat))
n[n > 3] = 3
n[n < -3] = -3
p3 = ggheat(n,fp_dat$ri,show_rownames = F)+
theme(axis.text = element_text(size = 8))
p3
p1 /p2 /p3 + plot_layout(design = 'A
B
C
C
C
C')
ggsave("risk_plot.png",height = 10,width = 10)
从上向下三个图分别表示:
1.每个病人的预测值,按照从小到大排序 2.每个病人的生存时间,颜色区分生死 3.热图,挑出的基因在每个样本中的表达量
maf文件下载和整理
maftools
1.数据获取
使用github上的R包TCGAmutations来获取数据
library(TCGAmutations)
library(maftools)
library(dplyr)
proj='TCGA-KIRC'
laml = tcga_load(study = "KIRC")
laml
#> An object of class MAF
#> ID summary Mean Median
#> 1: NCBI_Build NA NA NA
#> 2: Center NA NA NA
#> 3: Samples 370 NA NA
#> 4: nGenes 10243 NA NA
#> 5: Frame_Shift_Del 1792 4.843 4
#> 6: Frame_Shift_Ins 463 1.251 1
#> 7: In_Frame_Del 270 0.730 0
#> 8: In_Frame_Ins 26 0.070 0
#> 9: Missense_Mutation 17050 46.081 42
#> 10: Nonsense_Mutation 1297 3.505 3
#> 11: Nonstop_Mutation 33 0.089 0
#> 12: Splice_Site 672 1.816 1
#> 13: Translation_Start_Site 34 0.092 0
#> 14: total 21637 58.478 53
length(unique(laml@data$Tumor_Sample_Barcode))
#> [1] 370
length(unique(laml@data$Hugo_Symbol))
#> [1] 10243
有370个病人,10243个突变基因的信息。
3.突变数据的可视化
3.1 plotmafSummary
maftools 自带可视化函数plotmafSummary,可以比较直观的统计maf文件的数据。
#if (as.numeric(dev.cur()) != 1) graphics.off()
plotmafSummary(maf = laml, rmOutlier = TRUE,
#showBarcodes = FALSE,
addStat = 'median', dashboard = TRUE, titvRaw = FALSE)
就是将laml@data数据框做了统计,用barplot和boxplot做了可视化。
3.2 突变频谱图
可以选择突变数量前30的基因,也可以指定感兴趣的几个基因
oncoplot(maf = laml, top = 30, fontSize = 0.7)
g = c( "RBM43", "VHL", "SETD2", "PBRM1")
oncoplot(maf = laml,genes = g, fontSize = 0.7)
3.2 可以给oncoplot加上一些临床信息
pd 是临床信息数据框,第一列是Tumor_Sample_Barcode,后面几列是各种分组,如gender,age,stage等等。
colnames(laml@clinical.data)[57] = "stage"
oncoplot(maf = laml,
sortByAnnotation = TRUE,
clinicalFeatures = c("stage",
'gender')
)
自定义颜色
col = RColorBrewer::brewer.pal(n = 8,name = 'Set3')
stagecolors = col[1:4]
names(stagecolors) = na.omit(unique(laml@clinical.data$stage))
gendercolors = col[5:6]
names(gendercolors) = unique(laml@clinical.data$gender)
ancolors = list(stage = stagecolors,
gender = gendercolors)
oncoplot(maf = laml,
sortByAnnotation = TRUE,
clinicalFeatures = c("stage",
'gender'),
annotationColor = ancolors,
)
突变signature
signafiture
0.R包
rm(list=ls())
proj='TCGA-KIRC'
library(maftools)
library(dplyr)
library(pheatmap)
library(ComplexHeatmap)
library(circlize)
library(stringr)
library(deconstructSigs)
library(BSgenome)
library(BSgenome.Hsapiens.UCSC.hg19)
1.读取突变数据maf
是和上一篇一样的数据,从gdc下载的maf文件和临床信息
library(TCGAmutations)
library(maftools)
library(dplyr)
proj='TCGA-KIRC'
laml = tcga_load(study = "KIRC")
laml
#> An object of class MAF
#> ID summary Mean Median
#> 1: NCBI_Build NA NA NA
#> 2: Center NA NA NA
#> 3: Samples 370 NA NA
#> 4: nGenes 10243 NA NA
#> 5: Frame_Shift_Del 1792 4.843 4
#> 6: Frame_Shift_Ins 463 1.251 1
#> 7: In_Frame_Del 270 0.730 0
#> 8: In_Frame_Ins 26 0.070 0
#> 9: Missense_Mutation 17050 46.081 42
#> 10: Nonsense_Mutation 1297 3.505 3
#> 11: Nonstop_Mutation 33 0.089 0
#> 12: Splice_Site 672 1.816 1
#> 13: Translation_Start_Site 34 0.092 0
#> 14: total 21637 58.478 53
mut=laml@data
mut[1:4,1:2]
#> Hugo_Symbol Chromosome
#> 1: NMT2 10
#> 2: ARHGAP42 11
#> 3: RAG1 11
#> 4: NLRP14 11
mut=mut[mut$Variant_Type=='SNP',]
#每行记录了一个突变,所以统计样本列的行数即为样本的突变数量
plot(table(mut[,16]),las = 2)
2.制作signatures的输入数据(96频谱)
关于什么是mutation
signature:http://www.bio-info-trainee.com/1619.html
关于96频谱:http://www.biotrainee.com/thread-832-1-1.html
a = mut[,c(16,2,3,5,6)]
colnames(a)=c( "Sample","chr", "pos","ref", "alt")
a$Sample=as.character(a$Sample)
a$chr = paste0("chr",a$chr)
sigs.input <- mut.to.sigs.input(mut.ref = a,
sample.id = "Sample",
chr = "chr",
pos = "pos",
ref = "ref",
alt = "alt",
bsg = BSgenome.Hsapiens.UCSC.hg19)
class(sigs.input)
#> [1] "data.frame"
#第一个样本的突变绘图
barplot(as.numeric(sigs.input[1,])~colnames(sigs.input))
sigs.input[1:4,1:4]
#> A[C>A]A A[C>A]C A[C>A]G A[C>A]T
#> TCGA-3Z-A93Z-01A-11D-A36X-10 0 0 0 0
#> TCGA-6D-AA2E-01A-11D-A36X-10 0 0 0 0
#> TCGA-A3-3308-01A-01D-0966-08 0 0 0 0
#> TCGA-A3-3311-01A-01D-0966-08 0 0 0 0
一顿操作猛如虎,生成signature与样本对应关系的矩阵
if(!file.exists(paste0(proj,"_w.Rdata"))){
w=lapply(unique(a$Sample), function(i){
## signatures.cosmic signatures.nature2013
sample_1 = whichSignatures(tumor.ref = sigs.input,
signatures.ref = signatures.cosmic,
sample.id = i,
contexts.needed = TRUE,
tri.counts.method = 'default')
print(c(i,which(unique(a$Sample)==i)))
return(sample_1$weights)
})
w=do.call(rbind,w)
save(w,file = paste0(proj,"_w.Rdata"))
}
load(paste0(proj,"_w.Rdata"))
mat = t(w)
3.可视化
矩阵的可视化–热图
pheatmap::pheatmap(mat,
cluster_rows = F,
cluster_cols = T,
show_colnames = F)
任意基因的任意分组比较
任意基因的任意分组比较
0.输入数据
rm(list=ls())
proj = "TCGA-KIRC"
f = "explore.Rdata"
if(!file.exists(f)){
load(paste0(proj,".Rdata"))
load(paste0(proj,"_sur_model.Rdata"))
library(TCGAmutations)
library(maftools)
laml = tcga_load(study = "KIRC")
mut = laml@data
save(exp,Group,exprSet,meta,mut,file = f)
}
load(f)
这里面的数据:
exp是tumor-normal都有的表达矩阵,exprSet是只有tumor样本的表达矩阵。meta是临床信息表格,Group是tumor-normal分组信息。mut是突变信息,由maf文件读取并取子集得到。
1.比较任意基因在tumor和normal样本中的表达量
以PBRM1为例画图,可替换为其他任意基因。
table(Group)
#> Group
#> normal tumor
#> 72 535
library(ggstatsplot)
exp = log2(edgeR::cpm(exp)+1)
dat = data.frame(gene = exp["PBRM1",],
group = Group)
ggbetweenstats(data = dat, x = group, y = gene,title = "PBRM1")
2.任意基因在任意两个分组中的表达量对比
只要是可以根据临床信息查到或得到的分组,例如生死、人种、分期,都可以拿来做分组。
需要注意调整样本顺序,一一对应。
#按照生死、人种、分期分组看看
table(meta$event)
#>
#> 0 1
#> 344 169
table(meta$stage)
#>
#> I II III IV
#> 255 56 117 82
table(meta$race)
#>
#> asian black or african american white
#> 8 51 447
dat = data.frame(gene = exprSet["PBRM1",],
event = meta$event,
stage = meta$stage,
race = meta$race)
p1 = ggbetweenstats(data = dat, x = event, y = gene,title = "PBRM1")
p2 = ggbetweenstats(data = dat, x = stage, y = gene,title = "PBRM1")
p3 = ggbetweenstats(data = dat, x = race, y = gene,title = "PBRM1")
library(patchwork)
p1/p2/p3
3.根据某个基因是否突变分组比较某基因的表达量
dim(exprSet)
#> [1] 31798 513
mut[1:4,1:4]
#> Hugo_Symbol Chromosome Start_Position End_Position
#> 1 NMT2 10 15154934 15154934
#> 2 ARHGAP42 11 100845194 100845194
#> 3 RAG1 11 36597064 36597064
#> 4 NLRP14 11 7064166 7064166
library(stringr)
length(unique(str_sub(mut$Tumor_Sample_Barcode,1,16)))
#> [1] 370
k = colnames(exprSet) %in% unique(str_sub(mut$Tumor_Sample_Barcode,1,16));table(k)
#> k
#> FALSE TRUE
#> 163 350
#513个样本中,有350个有突变信息记录,将这些样本对应的表达矩阵取出来。
expm = exprSet[,k]
library(dplyr)
VHL_mut = mut %>%
filter(Hugo_Symbol=='VHL') %>%
as.data.frame() %>%
pull(Tumor_Sample_Barcode) %>%
as.character() %>%
str_sub(1,16)
#false 是未突变样本,true是突变样本
tail(rownames(expm))
#> [1] "MT-ND5" "MT-ND6" "MT-TE" "MT-CYB" "MT-TT" "MT-TP"
dat=data.frame(gene=expm['PBRM1',],
mut= str_sub(colnames(expm),1,16) %in% VHL_mut)
ggbetweenstats(data = dat, x = mut, y = gene)
#可以计算每个基因的p值,找找是不是有显著的。
res <- t.test(gene ~ as.factor(mut), data = dat)
res$p.value
#> [1] 0.1138761
#自己试试能不能写出来这个循环?
任意基因的相关性分析
corralation
0.输入数据和R包
rm(list=ls())
library(ggpubr)
library(stringr)
load(file = "explore.Rdata")
与15_boxplot的输入数据相同
library(dplyr)
VHL_mut = mut %>%
filter(Hugo_Symbol=='VHL') %>%
as.data.frame() %>%
pull(Tumor_Sample_Barcode) %>%
as.character() %>%
str_sub(1,16)
2.任意两个基因的相关性分析
2.1 简单绘图
使用ggpbur。
dat=data.frame(gene1=exprSet['SETD2',],
gene2=exprSet['PBRM1',],
stage=meta$stage)
sp1 <- ggscatter(dat, x = "gene1", y = "gene2",
add = "reg.line", # Add regressin line
add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
conf.int = TRUE # Add confidence interval
) + stat_cor(method = "pearson")
sp1
2.2 按照stage分组
不仅stage,任意在meta信息中能找到或生产的分组都可以。
sp2 <- ggscatter( dat, x = "gene1", y = "gene2",
color = "stage", palette = "jco",
add = "reg.line", conf.int = TRUE) + stat_cor(aes(color = stage))
sp2
2.3 按照是否突变来分组
理论上某个是否突变并不会改变某两个基因的相关性趋势,如果有这种特殊的突变,打乱了两个基因之间正常的相关关系,机制就有了。可以写循环试一下是否有这样的基因突变。
expm = exprSet[,str_sub(colnames(exprSet),1,16) %in% unique(str_sub(mut$Tumor_Sample_Barcode,1,16))]
dat=data.frame(gene1=expm['SETD2',],
gene2=expm['PBRM1',],
mut= str_sub(colnames(expm),1,16) %in% VHL_mut)
sp3 <- ggscatter( dat, x = "gene1", y = "gene2",
color = "mut", palette = "jco",
add = "reg.line", conf.int = TRUE) + stat_cor(aes(color = mut))
sp3
3.分面
sp2 + facet_wrap(~stage, scales = "free_x")
sp3 + facet_wrap(~mut, scales = "free_x")
分面的列有缺失值?导致了NA也占有一个子图。有两个选择,一个是换一列来分面,一个是作图数据中去除缺失值
library(tidyr)
dat=data.frame(gene1=exprSet['SETD2',],
gene2=exprSet['PBRM1',],
stage=meta$stage)
dat = drop_na(dat,stage)
ggscatter( dat, x = "gene1", y = "gene2",
color = "stage", palette = "jco",
add = "reg.line", conf.int = TRUE) + stat_cor(aes(color = stage))+
facet_wrap(~stage, scales = "free_x")