数据准备
1. 表达矩阵
- 将行名转换为 gene symbol
- 数值log2后取小数点后两位
- 锁定11个基因
suppressPackageStartupMessages(library(org.Hs.eg.db))
suppressPackageStartupMessages(library(AnnotationDbi))
expr <- read.table('./TCGA-HNSC.htseq_counts.tsv',header = T,sep='\t')
dim(expr)
# [1] 60488 547
expr[1:4,1:4]
# Ensembl_ID TCGA.BB.4224.01A TCGA.H7.7774.01A TCGA.CV.6943.01A
# 1 ENSG00000000003.13 11.127994 11.420487 11.39178
# 2 ENSG00000000005.5 1.584963 0.000000 0.00000
# 3 ENSG00000000419.11 10.650154 10.724514 10.68825
# 4 ENSG00000000457.12 10.055282 9.651052 9.84235
enb <- as.character(expr[,1])
enb_new <- c()
for (i in 1:length(enb)) {
enb_new[i] <- strsplit(enb[i],"\\.")[[1]][1]
}
enb_new <- unique(enb_new)
symb <- AnnotationDbi::select(org.Hs.eg.db,enb_new,"SYMBOL","ENSEMBL")
todelet <- match(symb[,1],enb_new )
expr[,1] <- enb_new
expr <- expr[todelet,]
expr[,1] <- symb[,2]
gs <- expr$Hugo_Symbol
expr <- expr[,-1]
rownames(expr) <- gs
expr <- log2(expr+1)
expr <- apply(expr, 2, function(x){
as.numeric(format(x,digits = 2))
})
save(expr,file='expression.Rdata')
2. 临床数据
- 将行名形式与表达矩阵统一
clin <- read.table('./TCGA-HNSC.GDC_phenotype.tsv',header = T,sep='\t')
summary(clin)
clin[1:4,1:4]
clin$submitter_id.samples = gsub('-','.',clin$submitter_id.samples)
row.names(clin) <- clin[,1]
save(clin,file='clinical.Rdata')
3. 生存数据
surv <- read.table('./TCGA-HNSC.survival.tsv',header = T,sep='\t')
4. 整合数据
- 锁定11个基因
- 整合
clin
和surv
geneselected <- expr[c("CHAC2","CLEC9A","GNG10","JCHAIN","KLRB1",
"NOG","OLR1","PRELID2","SYT1","VWCE","ZNF443"),]
phe <- clin[,c("submitter_id.samples","age_at_initial_pathologic_diagnosis","clinical_stage")]
colnames(phe) <- c("ID","age","stage")
row.names(phe) <- phe[,1]
row.names(surv) <- surv[,1]
phe <- cbind(phe,surv[,c("OS.time","OS")])
phe <- phe[phe$ID %in% surv$sample,]
surv <- surv[surv$sample %in% phe$ID,]
phe <- cbind(phe,surv[,c("OS.time","OS")])
## 去除 stage 中的空白值
phe <- phe[phe$stage!="",]
geneselected <- geneselected[,colnames(geneselected) %in% phe$ID]
## 更改数据框中所有列的属性,和示例数据一致
phe$age <- as.numeric(phe$age)
phe$stage <- as.character(phe$stage)
phe$OS.time <- as.numeric(phe$OS.time)
phe$OS <- as.numeric(phe$OS)
coxph
mySurv=with(phe,Surv(OS.time, OS))
cox_results <-apply(geneselected, 1 ,function(gene){
group=ifelse(gene>median(gene),'high','low')
survival_dat <- data.frame(group=group,stage=phe$stage,age=phe$age,
stringsAsFactors = F)
m=coxph(mySurv ~ age + stage+ group, data = survival_dat)
beta <- coef(m)
se <- sqrt(diag(vcov(m)))
HR <- exp(beta)
HRse <- HR * se
tmp <- round(cbind(coef = beta, se = se, z = beta/se, p = 1 - pchisq((beta/se)^2, 1),
HR = HR, HRse = HRse,
HRz = (HR - 1) / HRse, HRp = 1 - pchisq(((HR - 1)/HRse)^2, 1),
HRCILL = exp(beta - qnorm(.975, 0, 1) * se),
HRCIUL = exp(beta + qnorm(.975, 0, 1) * se)), 3)
return(tmp['grouplow',])
})
cox_results=t(cox_results)
table(cox_results[,4]<0.05)
#
# FALSE TRUE
# 10 1
cox_results[cox_results[,4]<0.05,]
# coef se z p HR HRse HRz HRp HRCILL HRCIUL
# 0.398 0.193 2.058 0.040 1.489 0.288 1.698 0.090 1.019 2.176
结果只有一个基因是显著的。
同时还有另一个问题(纠缠了大半天
当矩阵是全部基因的 expr
时,一直会出现报错,目前还未解决。
也是因为这个报错,删去了satge有空白值的项、更改了数据框每一列的属性,不过依旧报错,没有结果文件产生。
换成目标的11个基因的表达矩阵后,没有出现报错。