文章:

Cell. 2018 May : Chemoresistance Evolution in Triple-Negative Breast Cancer Delineated by Single-Cell Sequencing.
> 基于基因集的生存分析 - 图1

从MSigDB数据库得到6个基因集

确认PMID后下载基因集:基于基因集的生存分析 - 图2

下载METABRIC表达矩阵

Breast Cancer (METABRIC, Nature 2012 & Nat Commun 2016)
导出到Motrix里可以以2M/S的速度解决墙内下载问题。


1 读入 data_clinical_patient.txt

  1. clin <- read.table('./brca_metabric/data_clinical_patient.txt',header = T,sep='\t')
  2. summary(clin)
  3. save(clin,file='metabric_clinical.Rdata')

2 读入 data_expression_median.txt

  1. expr <- read.table('./brca_metabric/data_expression_median.txt',header = T,sep='\t')
  2. dim(expr)
  3. expr[1:4,1:4]
  4. head(colnames(expr),10)
  5. length(unique(expr$Hugo_Symbol))
  6. gs <- expr$Hugo_Symbol
  7. expr <- expr[,-c(1,2)]

取小数点前两位

  1. expr <- apply(expr, 2, function(x){
  2. as.numeric(format(x,digits = 2))
  3. })

行名重命名及去除NA值

  1. rownames(expr) <- gs
  2. expr <- na.omit(expr)
  3. save(expr,file='metabric_expression.Rdata')

将’-‘替换为’.’,与表达矩阵的列名一致

  1. clin$PATIENT_ID = gsub('-','.',clin$PATIENT_ID)
  2. phe <- clin[match(colnames(expr),clin$PATIENT_ID),]

3 GSVA

  1. library(ggplot2)
  2. library(clusterProfiler)
  3. library(org.Hs.eg.db)
  4. library(GSVA)
  5. library(GSEABase)
  6. library(dplyr)
  7. rm(list = ls())
  8. d <- './gmt/'
  9. gmts <- list.files(d)
  10. gmtfile <- lapply(file.path(d,gmts),getGmt)
  1. st <- Sys.time()
  2. for(i in 1:length(gmtfile)){
  3. assign(paste0("es_max_", i), gsva(expr,gmtfile[[i]],
  4. mx.diff=FALSE, verbose=FALSE,
  5. parallel.sz=1))
  6. }
  7. et <- Sys.time()
  8. et - st
  9. # Time difference of 7.654427 hours
  10. es_max <- rbind(es_max_1,es_max_2,es_max_3,es_max_4,es_max_5,es_max_6,es_max_7,es_max_8,
  11. es_max_9,es_max_10,es_max_11,es_max_12,es_max_13)
  12. es_max[1:4,1:4]
  13. save(es_max,file = "GSVA_es_max.Rdata")

4 Survival Analysis

A. 选病人(根据是否化疗)

Next, we used an extended cohort of breast cancer patients with chemotherapy (n = 412) from the METABRIC cohort (Curtis et al., 201230365-9?_returnURL=https%3A%2F%2Flinkinghub.elsevier.com%2Fretrieve%2Fpii%2FS0092867418303659%3Fshowall%3Dtrue#), Pereira et al., 201630365-9?_returnURL=https%3A%2F%2Flinkinghub.elsevier.com%2Fretrieve%2Fpii%2FS0092867418303659%3Fshowall%3Dtrue#)) with gene expression data and long-term clinical follow-up data to determine if any of the chemoresistance-associated signatures correlated with patient survival.

  1. table(clin$CHEMOTHERAPY)
  2. patients_selected<- clin[clin$CHEMOTHERAPY=="YES",]
  3. dat <- t(es_max)
  4. patients_selected$PATIENT_ID = gsub('-','.',patients_selected$PATIENT_ID)
  5. row.names(patients_selected) <- patients_selected$PATIENT_ID
  6. OSinfo <- patients_selected[,c("OS_MONTHS","OS_STATUS")]
  7. dat2 <- dat[row.names(dat) %in% row.names(OSinfo),] %>% na.omit()
  8. OSinfo <- OSinfo[row.names(dat2),]

B. 选病人(根据 claudin subtype)

  1. table(clin$CLAUDIN_SUBTYPE)
  2. patients_selected_1 <- filter(clin,CLAUDIN_SUBTYPE == "Basal")
  3. patients_selected_2 <- filter(clin,CLAUDIN_SUBTYPE == "claudin-low")
  4. patients_selected_claudin <- rbind(patients_selected_1, patients_selected_2)
  5. dat <- t(es_max)
  6. patients_selected_claudin$PATIENT_ID <- gsub('-','.',patients_selected_claudin$PATIENT_ID)
  7. row.names(patients_selected_claudin) <- patients_selected_claudin$PATIENT_ID
  8. OSinfo <- patients_selected_claudin[,c("OS_MONTHS","OS_STATUS")]
  9. dat2 <- dat[row.names(dat) %in% row.names(OSinfo),] %>% na.omit()
  10. OSinfo <- OSinfo[row.names(dat2),]

合并分组信息

  1. dat2 <- cbind(dat2,OSinfo)
  2. library(reshape2)
  3. dat3 <- melt(dat2,c("OS_MONTHS","OS_STATUS"))
  4. dat3$OS_STATUS <- ifelse(dat3$OS_STATUS=="DECEASED", 0, 1)
  5. colnames(dat3)[3] <- "geneset"
  6. save(dat3,file = "df_survivalplot_chem.Rdata")
  7. ## save(dat3,file = "df_survivalplot_claudin.Rdata")

作图

各个基因集分别作图

  1. library(survival)
  2. library(survminer)
  3. gs <- split(dat3,dat3$geneset)
  4. for (i in 1:length(gs)) {
  5. x=gs[[i]]
  6. x <- filter(x,abs(value) >= 0.1)
  7. x$group <- ifelse(x$value >= 0.1, "High","Low")
  8. table(x$group)
  9. mySurv_OS=with(x,Surv(OS_MONTHS, OS_STATUS))
  10. sfit=survfit(mySurv_OS~group,data=x)
  11. print(ggsurvplot(sfit,pval =TRUE,title=x$geneset))
  12. ggsave(paste0(i,'.png'))
  13. }

A. 根据是否化疗挑选病人

image.png

B. 根据 claudin subtype 挑选病人

image.png

合并具有UP和DOWN的基因集再画图

  1. dat3$geneset <- ifelse(grepl("AKT1",dat3$geneset)==TRUE,"AKT1_SIGNALING",
  2. ifelse(grepl("EMT",dat3$geneset)==TRUE,"EMT",
  3. ifelse(grepl("ECM",dat3$geneset)==TRUE,"ECM_AFFILIATED",
  4. ifelse(grepl("HYPOXIA",dat3$geneset)==TRUE,"HYPOXIA",
  5. ifelse(grepl("ANGIOGENESIS",dat3$geneset)==TRUE,"ANGIOGENESIS","CDH1_TARGETS")))))
  1. ### 根据是否化疗挑选病人 ###
  2. table(dat3$geneset)
  3. #
  4. # AKT1_SIGNALING ANGIOGENESIS CDH1_TARGETS ECM_AFFILIATED EMT HYPOXIA
  5. # 792 396 2376 396 792 396
  6. ### 根据 claudin subtype 挑选病人###
  7. table(dat3$geneset)
  8. #
  9. # AKT1_SIGNALING ANGIOGENESIS CDH1_TARGETS ECM_AFFILIATED
  10. # 796 398 2388 398
  11. # EMT HYPOXIA
  12. # 796 398
  1. gs2 <- split(dat3,dat3$geneset)
  2. for (i in 1:length(gs2)) {
  3. x=gs2[[i]]
  4. x$group <- ifelse(x$value > median(x$value), "High","Low")
  5. table(x$group)
  6. mySurv_OS=with(x,Surv(OS_MONTHS, OS_STATUS))
  7. sfit=survfit(mySurv_OS~group,data=x)
  8. print(ggsurvplot(sfit,pval =TRUE,title=x$geneset))
  9. ggsave(paste0(i,'_1.png'))
  10. }

A. 根据是否化疗挑选病人

基于基因集的生存分析 - 图5

B. 根据 claudin subtype 挑选病人

基于基因集的生存分析 - 图6