这里的无脑只是分了组之后,对目的基因没有要求的情况下
因为DESeq2也涉及阈值设定,不一定确保目的基因一定在UP或者DN的基因里
这里也只是打包了DEG和GO_KEGG

加载R包及数据载入

  1. library(DESeq2)
  2. library(ggplot2)
  3. library(ggrepel)
  4. library(pheatmap)
  5. library(clusterProfiler)
  6. library(org.Hs.eg.db)
  7. library(stringr)
  8. library(RColorBrewer)
  9. library(glmnet)
  10. library(ROCR)
  11. library(caret)
  12. library(timeROC)
  13. library(GSEABase)
  14. library(forcats)
  15. library(ggstance)
  16. library(RColorBrewer)
  17. load('Rdata/train_exp_cl.Rdata')
  18. group = factor(cl$risk1, levels = c('LowRisk', 'HighRisk'))

(一)差异分析

整理好表达矩阵exp和分组group,打包跑就行

  1. DEG_DESeq2 = function(exp, group){
  2. colData <- data.frame(row.names =colnames(exp),
  3. condition=group)
  4. dds <- DESeqDataSetFromMatrix(countData = exp,
  5. colData = colData,
  6. design = ~ condition)
  7. dds <- DESeq(dds)
  8. res <- results(dds, contrast = c("condition",rev(levels(group))))
  9. resOrdered <- res[order(res$pvalue),]
  10. DEG <- as.data.frame(resOrdered)
  11. DEG = na.omit(DEG)
  12. logFC_cutoff <- with(DEG,mean(abs(log2FoldChange)) + 2*sd(abs(log2FoldChange)) )
  13. k1 = (DEG$pvalue < 0.05)&(DEG$log2FoldChange < -logFC_cutoff)
  14. k2 = (DEG$pvalue < 0.05)&(DEG$log2FoldChange > logFC_cutoff)
  15. DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
  16. write.csv(DEG, file = 'outport/DEG_DESeq2.csv')
  17. }
  18. DEG_DESeq2(exp, group)
  19. DEG = read.csv('outport/DEG_DESeq2.csv')
  20. table(DEG$change)

(二)火山图

注释数据需要自己搞

  1. # 一些火山图上的注释数据
  2. logFC_cutoff <- with(DEG,mean(abs(log2FoldChange)) + 2*sd(abs(log2FoldChange)) )
  3. DEG$label = ifelse(DEG$X == 'HMOX1', 'HMOX1', '')
  4. p1 = ggplot(DEG, aes(x=log2FoldChange, y=-log10(pvalue),color=change))+
  5. geom_point(alpha=0.5, size=1.8)+
  6. theme_set(theme_set(theme_bw(base_size=20)))+
  7. xlab("log2FC") + ylab("-log10(pvalue)")+
  8. scale_colour_manual(values = c("#377EB8",'grey', "#E41A1C"))+
  9. theme_classic()+
  10. theme(panel.border = element_rect(fill=NA,color="black", size=1, linetype="solid"))+
  11. geom_hline(aes(yintercept = -log10(0.05)), linetype = 'dashed', size = 1)+
  12. geom_vline(aes(xintercept = logFC_cutoff), linetype = 'dashed',size = 1)+
  13. geom_vline(aes(xintercept = -logFC_cutoff), linetype = 'dashed', size = 1)+
  14. geom_label_repel(aes(label = label))
  15. p1

(三)热图

注释数据需要自己搞
想看画图代码直接拖到最后p2

  1. gene = c(DEG[DEG$change == 'UP','X'], DEG[DEG$change == 'DOWN','X'])
  2. data = log2(edgeR::cpm(exp)+1)[gene,]
  3. cl = cl[order(cl$risk1),]
  4. data = data[,rownames(cl)]
  5. table(DEG$change)
  6. table(cl$risk1)
  7. bk = c(seq(-1,-0.1,by=0.01),seq(0,1,by=0.01))
  8. annotation_col = data.frame(risk = rep(c('HighRisk','LowRisk'), c(80,80)))
  9. rownames(annotation_col) = rownames(cl)
  10. ann_colors = list(
  11. risk = c(HighRisk = "#E41A1C", LowRisk = "#377EB8")
  12. )
  13. p2 = pheatmap(data,
  14. scale = "row",
  15. show_colnames =F,
  16. show_rownames = F,
  17. cluster_cols = F,
  18. cluster_rows = F,
  19. color = c(colorRampPalette(colors = c("#377EB8","white"))(length(bk)/2),
  20. colorRampPalette(colors = c("white","#E41A1C"))(length(bk)/2)),
  21. breaks=bk,
  22. annotation_col = annotation_col,
  23. annotation_colors = ann_colors,
  24. gaps_row = 620,
  25. gaps_col = 80)

(四)GO/KEGG

我是把上调基因和下调基因分别跑了GO/KEGG
打包跑吧,一次把四个全跑了,需要联网

  1. rm(list = ls())
  2. DEG = read.csv('outport/DEG_DESeq2.csv')
  3. UP = DEG[DEG$change == 'UP','X']
  4. DN = DEG[DEG$change == 'DOWN', 'X']
  5. # GO/KEGG 需要entrezID
  6. UP <- bitr(UP, fromType = "SYMBOL",
  7. toType = c( "ENTREZID" ), OrgDb = org.Hs.eg.db )
  8. DN <- bitr(DN, fromType = "SYMBOL",
  9. toType = c( "ENTREZID" ), OrgDb = org.Hs.eg.db )
  10. UP = UP[,2]
  11. DN = DN[,2]
  12. GO_KEGG = function(gene, trend){
  13. enrich <- enrichGO(gene = gene,
  14. OrgDb='org.Hs.eg.db',
  15. ont="BP",
  16. pvalueCutoff=1,
  17. qvalueCutoff=1)
  18. GeneRatio <- as.numeric(lapply(strsplit(enrich$GeneRatio,split="/"),function(x) as.numeric(x[1])/as.numeric(x[2])))
  19. BgRatio <- as.numeric(lapply(strsplit(enrich$BgRatio,split="/"),function(x) as.numeric(x[1])/as.numeric(x[2]) ))
  20. enrich_factor <- GeneRatio/BgRatio
  21. out <- data.frame(enrich$ID,enrich$Description,enrich$GeneRatio,enrich$BgRatio,round(enrich_factor,2),enrich$pvalue,enrich$qvalue,enrich$geneID)
  22. colnames(out) <- c("ID","Description","GeneRatio","BgRatio","enrich_factor","pvalue","qvalue","geneID")
  23. out$GeneRatio = str_split(out$GeneRatio, '/', simplify = T)[,1]
  24. out$GeneRatio = as.numeric(out$GeneRatio)
  25. out = out[order(out$GeneRatio, decreasing = T),]
  26. write.csv(out,paste0('outport/hypoxia_BP_', trend, '.csv'))
  27. enrich <- enrichGO(gene = gene,
  28. OrgDb='org.Hs.eg.db',
  29. ont="MF",
  30. pvalueCutoff=1,
  31. qvalueCutoff=1)
  32. GeneRatio <- as.numeric(lapply(strsplit(enrich$GeneRatio,split="/"),function(x) as.numeric(x[1])/as.numeric(x[2])))
  33. BgRatio <- as.numeric(lapply(strsplit(enrich$BgRatio,split="/"),function(x) as.numeric(x[1])/as.numeric(x[2]) ))
  34. enrich_factor <- GeneRatio/BgRatio
  35. out <- data.frame(enrich$ID,enrich$Description,enrich$GeneRatio,enrich$BgRatio,round(enrich_factor,2),enrich$pvalue,enrich$qvalue,enrich$geneID)
  36. colnames(out) <- c("ID","Description","GeneRatio","BgRatio","enrich_factor","pvalue","qvalue","geneID")
  37. out$GeneRatio = str_split(out$GeneRatio, '/', simplify = T)[,1]
  38. out$GeneRatio = as.numeric(out$GeneRatio)
  39. out = out[order(out$GeneRatio, decreasing = T),]
  40. write.csv(out,paste0('outport/hypoxia_MF_', trend, '.csv'))
  41. enrich <- enrichGO(gene = gene,
  42. OrgDb='org.Hs.eg.db',
  43. ont="CC",
  44. pvalueCutoff=1,
  45. qvalueCutoff=1)
  46. GeneRatio <- as.numeric(lapply(strsplit(enrich$GeneRatio,split="/"),function(x) as.numeric(x[1])/as.numeric(x[2])))
  47. BgRatio <- as.numeric(lapply(strsplit(enrich$BgRatio,split="/"),function(x) as.numeric(x[1])/as.numeric(x[2]) ))
  48. enrich_factor <- GeneRatio/BgRatio
  49. out <- data.frame(enrich$ID,enrich$Description,enrich$GeneRatio,enrich$BgRatio,round(enrich_factor,2),enrich$pvalue,enrich$qvalue,enrich$geneID)
  50. colnames(out) <- c("ID","Description","GeneRatio","BgRatio","enrich_factor","pvalue","qvalue","geneID")
  51. out$GeneRatio = str_split(out$GeneRatio, '/', simplify = T)[,1]
  52. out$GeneRatio = as.numeric(out$GeneRatio)
  53. out = out[order(out$GeneRatio, decreasing = T),]
  54. write.csv(out,paste0('outport/hypoxia_CC_', trend, '.csv'))
  55. enrich <- enrichKEGG(gene = gene,
  56. organism='hsa',
  57. pvalueCutoff=1,
  58. qvalueCutoff=1)
  59. GeneRatio <- as.numeric(lapply(strsplit(enrich$GeneRatio,split="/"),function(x) as.numeric(x[1])/as.numeric(x[2])))
  60. BgRatio <- as.numeric(lapply(strsplit(enrich$BgRatio,split="/"),function(x) as.numeric(x[1])/as.numeric(x[2]) ))
  61. enrich_factor <- GeneRatio/BgRatio
  62. out <- data.frame(enrich$ID,enrich$Description,enrich$GeneRatio,enrich$BgRatio,round(enrich_factor,2),enrich$pvalue,enrich$qvalue,enrich$geneID)
  63. colnames(out) <- c("ID","Description","GeneRatio","BgRatio","enrich_factor","pvalue","qvalue","geneID")
  64. out$GeneRatio = str_split(out$GeneRatio, '/', simplify = T)[,1]
  65. out$GeneRatio = as.numeric(out$GeneRatio)
  66. out = out[order(out$GeneRatio, decreasing = T),]
  67. write.csv(out,paste0('outport/hypoxia_KEGG_', trend, '.csv'))
  68. }
  69. trend = 'UP'
  70. GO_KEGG(UP, trend)
  71. trend = 'DN'
  72. GO_KEGG(DN, trend)

可视化:

  1. ## UP (risk factor)
  2. out_bp = read.csv('outport/hypoxia_BP_UP.csv')
  3. out_mf = read.csv('outport/hypoxia_MF_UP.csv')
  4. out_cc = read.csv('outport/hypoxia_CC_UP.csv')
  5. out_kegg = read.csv('outport/hypoxia_KEGG_UP.csv')
  6. out = rbind(out_bp[1:10,],
  7. out_mf[1:10,],
  8. out_cc[1:10,],
  9. out_kegg[1:10,])
  10. out$group = rep(c('BP','MF','CC','KEGG'), c(10,10,10,10))
  11. out$Description = factor(out$Description, levels = out$Description)
  12. color = brewer.pal(n=4, 'Set2')
  13. p3 = ggplot(out, aes(x=Description, y=GeneRatio)) +
  14. geom_bar(aes(size = GeneRatio, fill = group), stat = "identity") +
  15. coord_flip() +
  16. xlab("") +
  17. theme_bw()+
  18. theme(axis.text.x = element_blank())+
  19. scale_x_discrete(labels=function(x) str_wrap(x, width=50))+
  20. theme_classic()+
  21. theme(panel.border = element_rect(fill=NA,color="black", size=1, linetype="solid"))+
  22. scale_fill_manual(values = c("#66C2A5","#FC8D62","#8DA0CB","#E78AC3"))+
  23. theme(axis.text.y = element_text(colour = rep(c("#66C2A5","#E78AC3","#FC8D62","#8DA0CB"),
  24. c(10,10,10,10))))
  25. p3
  26. ## DN (protective factor)
  27. out_bp = read.csv('outport/hypoxia_BP_DN.csv')
  28. out_mf = read.csv('outport/hypoxia_MF_DN.csv')
  29. out_cc = read.csv('outport/hypoxia_CC_DN.csv')
  30. out_kegg = read.csv('outport/hypoxia_KEGG_DN.csv')
  31. out = rbind(out_bp[1:10,],
  32. out_mf[1:10,],
  33. out_cc[1:10,],
  34. out_kegg[1:10,])
  35. out$group = rep(c('BP','MF','CC','KEGG'), c(10,10,10,10))
  36. out$Description = factor(out$Description, levels = out$Description)
  37. color = brewer.pal(n=4, 'Set2')
  38. p4 = ggplot(out, aes(x=Description, y=GeneRatio)) +
  39. geom_bar(aes(size = GeneRatio, fill = group), stat = "identity") +
  40. coord_flip() +
  41. xlab("") +
  42. theme_bw()+
  43. theme(axis.text.x = element_blank())+
  44. scale_x_discrete(labels=function(x) str_wrap(x, width=50))+
  45. theme_classic()+
  46. theme(panel.border = element_rect(fill=NA,color="black", size=1, linetype="solid"))+
  47. scale_fill_manual(values = c("#66C2A5","#FC8D62","#8DA0CB","#E78AC3"))+
  48. theme(axis.text.y = element_text(colour = rep(c("#66C2A5","#E78AC3","#FC8D62","#8DA0CB"),
  49. c(10,10,10,10))))
  50. p4

(五)GSEA

GSEA是基于logFC的,DESeq2之后,不要提取差异基因,而是拿着所有基因的logFC跑GSEA就可以
数据集下载地址:https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp#C5
我跑的是所有GO数据集,会得到一个gmt文件
image.png
我依据分组情况,跑了其中分组为HH和LH的两组的GSEA
输入数据准备

  1. rm(list = ls())
  2. load('Rdata/train_exp_cl.Rdata')
  3. dattrait = cl[cl$group %in% c('HL','LL'),]
  4. datexpr = exp[,rownames(dattrait)]
  5. identical(rownames(dattrait), colnames(datexpr))

DESeq2 代码同(一),命名那里我改了一下,因为不改会覆盖原来的DEG文件

  1. DEG_DESeq2 = function(exp, group){
  2. colData <- data.frame(row.names =colnames(exp),
  3. condition=group)
  4. dds <- DESeqDataSetFromMatrix(countData = exp,
  5. colData = colData,
  6. design = ~ condition)
  7. dds <- DESeq(dds)
  8. res <- results(dds, contrast = c("condition",rev(levels(group))))
  9. resOrdered <- res[order(res$pvalue),]
  10. DEG <- as.data.frame(resOrdered)
  11. DEG = na.omit(DEG)
  12. logFC_cutoff <- with(DEG,mean(abs(log2FoldChange)) + 2*sd(abs(log2FoldChange)) )
  13. k1 = (DEG$pvalue < 0.05)&(DEG$log2FoldChange < -logFC_cutoff)
  14. k2 = (DEG$pvalue < 0.05)&(DEG$log2FoldChange > logFC_cutoff)
  15. DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
  16. write.csv(DEG, file = 'outport/DEG_DESeq2_HH_LH.csv')
  17. }
  18. group = factor(dattrait$risk11, levels = c('HL','LL'))
  19. DEG_DESeq2(datexpr, group)
  20. DEG = read.csv('outport/DEG_DESeq2_HH_LH.csv')

GSEA结果处理

  1. geneset = read.gmt('import/c5.go.v7.4.symbols.gmt')
  2. geneList = DEG$log2FoldChange
  3. names(geneList) = DEG$X
  4. geneList = sort(geneList, decreasing = T)
  5. egmt <- GSEA(geneList,
  6. TERM2GENE=geneset,
  7. verbose=T,
  8. pvalueCutoff = 1)
  9. gsea <- as.data.frame(egmt@result)
  10. gsea = gsea[gsea$p.adjust<0.01,]
  11. up = gsea[gsea$NES > 0,]
  12. dn = gsea[gsea$NES < 0,]
  13. # 太多了,就分别取了前十个作图
  14. up = up[order(up$NES, decreasing = T),]
  15. dn = dn[order(dn$NES),]
  16. x = rbind(dn[1:10,], up[1:10,])

可视化

  1. x$Description = factor(x$Description, levels = x$Description)
  2. ggplot(x, aes(NES, Description, fill = p.adjust))+
  3. geom_barh(stat = 'identity')+
  4. scale_fill_continuous(low = "#FC8D62", high = "#66C2A5")+
  5. theme_classic()+
  6. ylab('')+
  7. theme(panel.border = element_rect(fill=NA,color="black", size=1, linetype="solid"))

成品 ↓
image.png
image.png