数据准备

1. 表达矩阵

  • 将行名转换为 gene symbol
  • 数值log2后取小数点后两位
  • 锁定11个基因
  1. suppressPackageStartupMessages(library(org.Hs.eg.db))
  2. suppressPackageStartupMessages(library(AnnotationDbi))
  3. expr <- read.table('./TCGA-HNSC.htseq_counts.tsv',header = T,sep='\t')
  4. dim(expr)
  5. # [1] 60488 547
  6. expr[1:4,1:4]
  7. # Ensembl_ID TCGA.BB.4224.01A TCGA.H7.7774.01A TCGA.CV.6943.01A
  8. # 1 ENSG00000000003.13 11.127994 11.420487 11.39178
  9. # 2 ENSG00000000005.5 1.584963 0.000000 0.00000
  10. # 3 ENSG00000000419.11 10.650154 10.724514 10.68825
  11. # 4 ENSG00000000457.12 10.055282 9.651052 9.84235
  12. enb <- as.character(expr[,1])
  13. enb_new <- c()
  14. for (i in 1:length(enb)) {
  15. enb_new[i] <- strsplit(enb[i],"\\.")[[1]][1]
  16. }
  17. enb_new <- unique(enb_new)
  18. symb <- AnnotationDbi::select(org.Hs.eg.db,enb_new,"SYMBOL","ENSEMBL")
  19. todelet <- match(symb[,1],enb_new )
  20. expr[,1] <- enb_new
  21. expr <- expr[todelet,]
  22. expr[,1] <- symb[,2]
  23. gs <- expr$Hugo_Symbol
  24. expr <- expr[,-1]
  25. rownames(expr) <- gs
  26. expr <- log2(expr+1)
  27. expr <- apply(expr, 2, function(x){
  28. as.numeric(format(x,digits = 2))
  29. })
  30. save(expr,file='expression.Rdata')

2. 临床数据

  • 将行名形式与表达矩阵统一
  1. clin <- read.table('./TCGA-HNSC.GDC_phenotype.tsv',header = T,sep='\t')
  2. summary(clin)
  3. clin[1:4,1:4]
  4. clin$submitter_id.samples = gsub('-','.',clin$submitter_id.samples)
  5. row.names(clin) <- clin[,1]
  6. save(clin,file='clinical.Rdata')

3. 生存数据

  1. surv <- read.table('./TCGA-HNSC.survival.tsv',header = T,sep='\t')

4. 整合数据

  • 锁定11个基因
  • 整合 clinsurv
  1. geneselected <- expr[c("CHAC2","CLEC9A","GNG10","JCHAIN","KLRB1",
  2. "NOG","OLR1","PRELID2","SYT1","VWCE","ZNF443"),]
  3. phe <- clin[,c("submitter_id.samples","age_at_initial_pathologic_diagnosis","clinical_stage")]
  4. colnames(phe) <- c("ID","age","stage")
  5. row.names(phe) <- phe[,1]
  6. row.names(surv) <- surv[,1]
  7. phe <- cbind(phe,surv[,c("OS.time","OS")])
  8. phe <- phe[phe$ID %in% surv$sample,]
  9. surv <- surv[surv$sample %in% phe$ID,]
  10. phe <- cbind(phe,surv[,c("OS.time","OS")])
  11. ## 去除 stage 中的空白值
  12. phe <- phe[phe$stage!="",]
  13. geneselected <- geneselected[,colnames(geneselected) %in% phe$ID]
  14. ## 更改数据框中所有列的属性,和示例数据一致
  15. phe$age <- as.numeric(phe$age)
  16. phe$stage <- as.character(phe$stage)
  17. phe$OS.time <- as.numeric(phe$OS.time)
  18. phe$OS <- as.numeric(phe$OS)

coxph

  1. mySurv=with(phe,Surv(OS.time, OS))
  2. cox_results <-apply(geneselected, 1 ,function(gene){
  3. group=ifelse(gene>median(gene),'high','low')
  4. survival_dat <- data.frame(group=group,stage=phe$stage,age=phe$age,
  5. stringsAsFactors = F)
  6. m=coxph(mySurv ~ age + stage+ group, data = survival_dat)
  7. beta <- coef(m)
  8. se <- sqrt(diag(vcov(m)))
  9. HR <- exp(beta)
  10. HRse <- HR * se
  11. tmp <- round(cbind(coef = beta, se = se, z = beta/se, p = 1 - pchisq((beta/se)^2, 1),
  12. HR = HR, HRse = HRse,
  13. HRz = (HR - 1) / HRse, HRp = 1 - pchisq(((HR - 1)/HRse)^2, 1),
  14. HRCILL = exp(beta - qnorm(.975, 0, 1) * se),
  15. HRCIUL = exp(beta + qnorm(.975, 0, 1) * se)), 3)
  16. return(tmp['grouplow',])
  17. })
  18. cox_results=t(cox_results)
  19. table(cox_results[,4]<0.05)
  20. #
  21. # FALSE TRUE
  22. # 10 1
  23. cox_results[cox_results[,4]<0.05,]
  24. # coef se z p HR HRse HRz HRp HRCILL HRCIUL
  25. # 0.398 0.193 2.058 0.040 1.489 0.288 1.698 0.090 1.019 2.176

TCGA-HNSC Cox regression - 图1

结果只有一个基因是显著的。

同时还有另一个问题(纠缠了大半天

当矩阵是全部基因的 expr 时,一直会出现报错,目前还未解决。

也是因为这个报错,删去了satge有空白值的项、更改了数据框每一列的属性,不过依旧报错,没有结果文件产生。

换成目标的11个基因的表达矩阵后,没有出现报错。

TCGA-HNSC Cox regression - 图2