DESeq2常用于识别差异基因,它主要使用了标准化因子标准化数据,再根据广义线性模型判别组间差异(组间残差是否显著判断)。在获取差异基因结果后,我们可以进行下一步的富集分析,常用方法有基于在线网站DAVID以及脚本处理的两类,本文介绍基于fgsea的方法计算富集分析得分。更多知识分享请到 https://zouhua.top/

DEseq2

可参考个人博客差异表达分析之Deseq2了解DESeq2如何标准化数据和识别差异基因。下面给出简要代码

  1. library(DESeq2)
  2. library(airway)
  3. data("airway")
  4. ddsSE <- DESeqDataSet(airway, design = ~ cell + dex)
  5. ddsSE <- DESeq(ddsSE)
  6. res <- results(ddsSE, tidy = TRUE) %>% na.omit() %>% as_tibble()
  7. head(res)
  1. # A tibble: 6 x 7
  2. row baseMean log2FoldChange lfcSE stat pvalue padj
  3. <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
  4. 1 ENSG00000000003 709. 0.381 0.101 3.79 0.000152 0.00128
  5. 2 ENSG00000000419 520. -0.207 0.112 -1.84 0.0653 0.197
  6. 3 ENSG00000000457 237. -0.0379 0.143 -0.264 0.792 0.911
  7. 4 ENSG00000000460 57.9 0.0882 0.287 0.307 0.759 0.895
  8. 5 ENSG00000000971 5817. -0.426 0.0883 -4.83 0.00000138 0.0000182
  9. 6 ENSG00000001036 1282. 0.241 0.0887 2.72 0.00658 0.0328

转换geneID

我们使用的MSigDB数据库的pathway 基因ID只有entrez和HGNC symbol两类,如果是ensemble id,需要转换

  1. library(org.Hs.eg.db)
  2. library(tidyverse)
  3. ens2symbol <- AnnotationDbi::select(org.Hs.eg.db,
  4. key=res$row,
  5. columns="SYMBOL",
  6. keytype="ENSEMBL")
  7. ens2symbol <- as_tibble(ens2symbol)
  8. head(ens2symbol)
  1. # A tibble: 6 x 2
  2. ENSEMBL SYMBOL
  3. <chr> <chr>
  4. 1 ENSG00000000003 TSPAN6
  5. 2 ENSG00000000419 DPM1
  6. 3 ENSG00000000457 SCYL3
  7. 4 ENSG00000000460 C1orf112
  8. 5 ENSG00000000971 CFH
  9. 6 ENSG00000001036 FUCA2
  • 合并数据;过滤NA值;去重;重复基因求stat(stat数据作为排序指标用于后续富集分析)
  1. res2 <- inner_join(res, ens2symbol, by=c("row"="ENSEMBL")) %>%
  2. dplyr::select(SYMBOL, stat) %>%
  3. na.omit() %>%
  4. distinct() %>%
  5. group_by(SYMBOL) %>%
  6. summarize(stat=mean(stat))
  7. head(res2 )
  1. # A tibble: 6 x 2
  2. SYMBOL stat
  3. <chr> <dbl>
  4. 1 A1BG 0.680
  5. 2 A1BG-AS1 -1.79
  6. 3 A2M -1.26
  7. 4 A2M-AS1 0.875
  8. 5 A4GALT -4.14
  9. 6 A4GNT 0.00777

构建fgsea输入数据

  • 基因排序值转换
  1. library(fgsea)
  2. ranks <- deframe(res2)
  3. head(ranks, 20)
  1. A1BG A1BG-AS1 A2M A2M-AS1 A4GALT A4GNT AAAS AACS
  2. 0.679946437 -1.793291412 -1.259539478 0.875346116 -4.144839902 0.007772497 0.163986128 1.416071728
  3. AADACL4 AADAT AAGAB AAK1 AAMDC AAMP AAR2 AARS1
  4. -1.876311694 3.079128034 1.554279946 1.141522348 -2.147527241 -3.170612332 -2.364380163 4.495474603
  5. AARS2 AARSD1 AASDH AASDHPPT
  6. 5.057470292 0.654208006 0.665531695 -0.353496148
  • pathways的基因集合,上MSigDB下载基因集。演示使用KEGG基因集
  1. pathways.hallmark <- gmtPathways("../../Result/GeneID/msigdb.v7.1.symbols_KEGG.gmt")
  2. pathways.hallmark %>%
  3. head() %>%
  4. lapply(head)
  1. $KEGG_GLYCOLYSIS_GLUCONEOGENESIS
  2. [1] "ACSS2" "GCK" "PGK2" "PGK1" "PDHB" "PDHA1"
  3. $KEGG_CITRATE_CYCLE_TCA_CYCLE
  4. [1] "IDH3B" "DLST" "PCK2" "CS" "PDHB" "PCK1"
  5. $KEGG_PENTOSE_PHOSPHATE_PATHWAY
  6. [1] "RPE" "RPIA" "PGM2" "PGLS" "PRPS2" "FBP2"
  7. $KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS
  8. [1] "UGT1A10" "UGT1A8" "RPE" "UGT1A7" "UGT1A6" "UGT2B28"
  9. $KEGG_FRUCTOSE_AND_MANNOSE_METABOLISM
  10. [1] "MPI" "PMM2" "PMM1" "FBP2" "PFKM" "GMDS"
  11. $KEGG_GALACTOSE_METABOLISM
  12. [1] "GCK" "GALK1" "GLB1" "GALE" "B4GALT1" "PGM2"
  • 运行
  1. fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)
  2. head(fgseaRes[order(pval), ])
  • 从查看KEGG_REGULATION_OF_ACTIN_CYTOSKELETON富集分数分布
  1. plotEnrichment(pathways.hallmark[["KEGG_REGULATION_OF_ACTIN_CYTOSKELETON"]],
  2. ranks) + labs(title="KEGG_REGULATION_OF_ACTIN_CYTOSKELETON")

数据分析:基于DESeq2结果的基因富集分析 - 图1

  • 查看上下调通路结果
  1. topPathwaysUp <- fgseaRes[ES > 0][head(order(pval), n=10), pathway]
  2. topPathwaysDown <- fgseaRes[ES < 0][head(order(pval), n=10), pathway]
  3. topPathways <- c(topPathwaysUp, rev(topPathwaysDown))
  4. plotGseaTable(pathways.hallmark[topPathways], ranks, fgseaRes,
  5. gseaParam=0.5)

数据分析:基于DESeq2结果的基因富集分析 - 图2

  • 其他展示方式
  1. fgseaResTidy <- fgseaRes %>%
  2. as_tibble() %>%
  3. arrange(desc(NES))
  4. # Show in a nice table:
  5. fgseaResTidy %>%
  6. dplyr::select(-leadingEdge, -ES, -nMoreExtreme) %>%
  7. arrange(padj) %>%
  8. DT::datatable()
  9. ggplot(fgseaResTidy, aes(reorder(pathway, NES), NES)) +
  10. geom_col(aes(fill = padj<0.0001)) +
  11. coord_flip() +
  12. labs(x="Pathway", y="Normalized Enrichment Score",
  13. title="Hallmark pathways NES from GSEA") +
  14. theme_minimal()

数据分析:基于DESeq2结果的基因富集分析 - 图3

查看通路的基因

  1. res_temp <- inner_join(res, ens2symbol, by=c("row"="ENSEMBL"))
  2. pathways.hallmark %>%
  3. enframe("pathway", "SYMBOL") %>%
  4. unnest(cols = c(SYMBOL)) %>%
  5. inner_join(res_temp , by="SYMBOL") %>%
  6. head()
  1. # A tibble: 6 x 9
  2. pathway SYMBOL row baseMean log2FoldChange lfcSE stat pvalue padj
  3. <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
  4. 1 KEGG_GLYCOLYSIS_GLUCONEOGENESIS ACSS2 ENSG00000131069 669. -0.269 0.114 -2.35 0.0188 0.0756
  5. 2 KEGG_GLYCOLYSIS_GLUCONEOGENESIS GCK ENSG00000106633 28.8 0.305 0.374 0.815 0.415 0.662
  6. 3 KEGG_GLYCOLYSIS_GLUCONEOGENESIS PGK1 ENSG00000102144 7879. -0.300 0.353 -0.850 0.395 0.642
  7. 4 KEGG_GLYCOLYSIS_GLUCONEOGENESIS PDHB ENSG00000168291 648. -0.257 0.102 -2.52 0.0117 0.0521
  8. 5 KEGG_GLYCOLYSIS_GLUCONEOGENESIS PDHA1 ENSG00000131828 651. -0.0744 0.104 -0.715 0.475 0.710
  9. 6 KEGG_GLYCOLYSIS_GLUCONEOGENESIS PGM2 ENSG00000169299 302. -0.315 0.136 -2.33 0.0201 0.0797

其他用法

  • miR targets
  1. fgsea(pathways=gmtPathways("msigdb/c3.mir.v6.2.symbols.gmt"), ranks, nperm=1000) %>%
  2. as_tibble() %>%
  3. arrange(padj)
  • GO annotations
  1. fgsea(pathways=gmtPathways("msigdb/c5.all.v6.2.symbols.gmt"), ranks, nperm=1000) %>%
  2. as_tibble() %>%
  3. arrange(padj)
  • 非人物种
  1. library(biomaRt)
  2. mart <- useDataset("mmusculus_gene_ensembl", mart=useMart("ensembl"))
  3. bm <- getBM(attributes=c("ensembl_gene_id", "hsapiens_homolog_associated_gene_name"), mart=mart) %>%
  4. distinct() %>%
  5. as_tibble() %>%
  6. na_if("") %>%
  7. na.omit()
  8. bm

参考

  1. Fast Gene Set Enrichment Analysis
  2. DESeq results to pathways in 60 Seconds with the fgsea package

参考文章如引起任何侵权问题,可以与我联系,谢谢。