差异分析后,上下调的基因做富集

    1. #单细胞差异分析后的普通的GO和KEGG注释
    2. rm(list = ls())
    3. library(Seurat)
    4. library(SeuratData)
    5. library(ggplot2)
    6. library(patchwork)
    7. library(dplyr)
    8. load(file = 'basic.sce.pbmc.Rdata')
    9. DimPlot(pbmc,
    10. reduction = 'umap',
    11. label = TRUE, pt.size = 0.5) + NoLegend()
    12. sce=pbmc
    13. levels(Idents(sce))
    14. sce = sce[, Idents(sce) %in%
    15. c( "FCGR3A+ Mono", "CD14+ Mono" )] # FCGR3A就是CD16
    16. levels(Idents(sce))
    17. markers_df <- FindMarkers(object = sce,
    18. ident.1 = 'FCGR3A+ Mono',
    19. ident.2 = 'CD14+ Mono',
    20. #logfc.threshold = 0,
    21. min.pct = 0.25)
    22. head(markers_df)
    23. #筛选avg_log2FC > 1
    24. cg_markers_df=markers_df[abs(markers_df$avg_log2FC) >1,]
    25. dim(cg_markers_df)
    26. DoHeatmap(subset(sce, downsample = 150),
    27. slot = 'counts',
    28. unique(rownames(cg_markers_df)),size=3)
    29. #做FCGR3A+ Mono 和CD14+ Mono 之间的差异分析
    30. deg=FindMarkers(object = sce,
    31. ident.1 = 'FCGR3A+ Mono',
    32. ident.2 = 'CD14+ Mono',
    33. test.use='MAST') #通常用MAST算法算deg
    34. head(deg)
    35. save(deg,file = 'deg-by-MAST-for-mono-2-cluster.Rdata')

    image.png
    图 deg

    gene_up=rownames(deg[deg$avg_log2FC>0,])
    gene_down=rownames(deg[deg$avg_log2FC < 0,])
    
    library(org.Hs.eg.db)
    #把SYMBOL转为ENTREZID,这里会损失一部分无法匹配到的
    gene_up=as.character(na.omit(AnnotationDbi::select(org.Hs.eg.db,
                                                       keys = gene_up,
                                                       columns = 'ENTREZID',
                                                       keytype = 'SYMBOL')[,2]))
    gene_down=as.character(na.omit(AnnotationDbi::select(org.Hs.eg.db,
                                                         keys = gene_down,
                                                         columns = 'ENTREZID',
                                                         keytype = 'SYMBOL')[,2]))
    
    #人家的包就是 entrez ID  设计
    library(clusterProfiler)
    #function里是kegg和GO以及GSEA分析
    source('functions.R')
    

    Function:

    ## KEGG pathway analysis
    ### 做KEGG数据集超几何分布检验分析,重点在结果的可视化及生物学意义的理解。
    run_kegg <- function(gene_up,gene_down,pro='test'){
      library(ggplot2)
      gene_up=unique(gene_up)
      gene_down=unique(gene_down)
      gene_diff=unique(c(gene_up,gene_down))
      ##over-representation test
      #下面把3个基因集分开做超几何分布检验
      #首先是上调基因集
      kk.up <- enrichKEGG(gene         = gene_up,
                          organism     = 'hsa',
                          #universe    = gene_all,
                          pvalueCutoff = 0.9,
                          qvalueCutoff =0.9)
      head(kk.up)[,1:6]
      kk=kk.up
      dotplot(kk)
      kk=DOSE::setReadable(kk, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
      write.csv(kk@result,paste0(pro,'_kk.up.csv'))
    
      #下调基因集
      kk.down <- enrichKEGG(gene         =  gene_down,
                            organism     = 'hsa',
                            #universe    = gene_all,
                            pvalueCutoff = 0.9,
                            qvalueCutoff =0.9)
      head(kk.down)[,1:6]
      kk=kk.down
      dotplot(kk)
      kk=DOSE::setReadable(kk, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
      write.csv(kk@result,paste0(pro,'_kk.down.csv'))
    
      #上下调合并后的基因集
      kk.diff <- enrichKEGG(gene         = gene_diff,
                            organism     = 'hsa',
                            pvalueCutoff = 0.05)
      head(kk.diff)[,1:6]
      kk=kk.diff
      dotplot(kk)
      kk=DOSE::setReadable(kk, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
      write.csv(kk@result,paste0(pro,'_kk.diff.csv'))
    
      kegg_up_dt <- as.data.frame(kk.up)
      kegg_down_dt <- as.data.frame(kk.down)
      kegg_diff_dt <- as.data.frame(kk.diff)
    
      up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.01,];up_kegg$group=1
      down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.01,];down_kegg$group=-1
    
      #画图设置,这个图很丑,大家可以自行修改
      g_kegg=kegg_plot(up_kegg,down_kegg)
      print(g_kegg)
    
      ggsave(g_kegg,filename = paste0(pro,'_kegg_up_down.png') )
    
      if(F){
        ##GSEA 
        #GSEA算法跟上面的使用差异基因集做超几何分布检验不一样
        kk_gse <- gseKEGG(geneList     = geneList,
                          organism     = 'hsa',
                          nPerm        = 1000,
                          minGSSize    = 20,
                          pvalueCutoff = 0.9,
                          verbose      = FALSE)
        head(kk_gse)[,1:6]
        gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))
        gseaplot(kk_gse, 'hsa04110',title = 'Cell cycle') 
        kk=DOSE::setReadable(kk_gse, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
        tmp=kk@result
        write.csv(kk@result,paste0(pro,'_kegg.gsea.csv'))
    
    
        #这里找不到显著下调的通路,可以选择调整阈值,或者其它
        down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,];down_kegg$group=-1
        up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,];up_kegg$group=1
    
        g_kegg=kegg_plot(up_kegg,down_kegg)
        print(g_kegg)
        ggsave(g_kegg,filename = paste0(pro,'_kegg_gsea.png'))
      }
    
    }
    
    ##GO database analysis 
    #做GO数据集超几何分布检验分析,重点在结果的可视化及生物学意义的理解
    run_go <- function(gene_up,gene_down,pro='test'){
      gene_up=unique(gene_up)
      gene_down=unique(gene_down)
      gene_diff=unique(c(gene_up,gene_down))
      g_list=list(gene_up=gene_up,
                  gene_down=gene_down,
                  gene_diff=gene_diff)
    
      #因为go数据库非常多基因集,所以运行速度会很慢。
      if(T){
        go_enrich_results <- lapply( g_list , function(gene) {
          lapply( c('BP','MF','CC') , function(ont) {
            cat(paste('Now process ',ont ))
            ego <- enrichGO(gene          = gene,
                            #universe      = gene_all,
                            OrgDb         = org.Hs.eg.db,
                            ont           = ont ,
                            pAdjustMethod = "BH",
                            pvalueCutoff  = 0.99,
                            qvalueCutoff  = 0.99,
                            readable      = TRUE)
    
            print( head(ego) )
            return(ego)
          })
        })
        save(go_enrich_results,file =paste0(pro, '_go_enrich_results.Rdata'))
    
      }
    
      #只有第一次需要运行,就保存结果哈,下次需要探索结果,就载入即可
      load(file=paste0(pro, '_go_enrich_results.Rdata'))
    
      n1= c('gene_up','gene_down','gene_diff')
      n2= c('BP','MF','CC') 
      for (i in 1:3){
        for (j in 1:3){
          fn=paste0(pro, '_dotplot_',n1[i],'_',n2[j],'.png')
          cat(paste0(fn,'\n'))
          png(fn,res=150,width = 1080)
          print( dotplot(go_enrich_results[[i]][[j]] ))
          dev.off()
        }
      }
    
    
    }
    
    kegg_plot <- function(up_kegg,down_kegg){
      dat=rbind(up_kegg,down_kegg)
      colnames(dat)
      dat$pvalue = -log10(dat$pvalue)
      dat$pvalue=dat$pvalue*dat$group 
    
      dat=dat[order(dat$pvalue,decreasing = F),]
    
      g_kegg<- ggplot(dat, aes(x=reorder(Description,order(pvalue, decreasing = F)), y=pvalue, fill=group)) + 
        geom_bar(stat="identity") + 
        scale_fill_gradient(low="blue",high="red",guide = FALSE) + 
        scale_x_discrete(name ="Pathway names") +
        scale_y_continuous(name ="log10P-value") +
        coord_flip() + theme_bw()+theme(plot.title = element_text(hjust = 0.5))+
        ggtitle("Pathway Enrichment") 
    }
    

    下面是对Function的解释:
    包装成函数的注释流程(可替换到成Function):

      pro='test'
      library(ggplot2)
      gene_up=unique(gene_up) #unique去重
      gene_down=unique(gene_down)
      gene_diff=unique(c(gene_up,gene_down))
    
      ##over-representation test
      #下面把3个基因集分开做超几何分布检验
    
      ##上调基因集
      kk.up <- enrichKEGG(gene         = gene_up,
                          organism     = 'hsa',
                          #universe    = gene_all,
                          pvalueCutoff = 0.9,
                          qvalueCutoff =0.9)
      head(kk.up)[,1:6]
      kk=kk.up
      dotplot(kk)
      kk=DOSE::setReadable(kk, 
                           OrgDb='org.Hs.eg.db',
                           keyType='ENTREZID')
      write.csv(kk@result,paste0(pro,'_kk.up.csv'))
    

    image.png
    图 上调基因集

      #下调基因集
      kk.down <- enrichKEGG(gene         =  gene_down,
                            organism     = 'hsa',
                            #universe    = gene_all,
                            pvalueCutoff = 0.9,
                            qvalueCutoff =0.9)
      head(kk.down)[,1:6]
      kk=kk.down
      dotplot(kk)
      kk=DOSE::setReadable(kk, 
                           OrgDb='org.Hs.eg.db',
                           keyType='ENTREZID')
      write.csv(kk@result,paste0(pro,'_kk.down.csv'))
    

    image.png
    图 下调基因集

      #上下调合并后的基因集
      kk.diff <- enrichKEGG(gene         = gene_diff,
                            organism     = 'hsa',
                            pvalueCutoff = 0.05)
      head(kk.diff)[,1:6]
      kk=kk.diff
      dotplot(kk)
      kk=DOSE::setReadable(kk, 
                           OrgDb='org.Hs.eg.db',
                           keyType='ENTREZID')
      write.csv(kk@result,paste0(pro,'_kk.diff.csv'))
    

    image.png
    图 上下调合并的基因集

    kegg_up_dt <- as.data.frame(kk.up)
      kegg_down_dt <- as.data.frame(kk.down)
      kegg_diff_dt <- as.data.frame(kk.diff)
    
      up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.01,];up_kegg$group=1
      down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.01,];down_kegg$group=-1
    
      #画图设置, 这个图很丑,大家可以自行修改。
      g_kegg=kegg_plot(up_kegg,down_kegg)
      print(g_kegg)
    
      ggsave(g_kegg,filename = paste0(pro,'_kegg_up_down.png') )
    
    ##GO database analysis 
    #做GO数据集超几何分布检验分析,重点在结果的可视化及生物学意义的理解
    run_go <- function(gene_up,gene_down,pro='test'){
      gene_up=unique(gene_up)
      gene_down=unique(gene_down)
      gene_diff=unique(c(gene_up,gene_down))
      g_list=list(gene_up=gene_up,
                  gene_down=gene_down,
                  gene_diff=gene_diff)
    
      #因为go数据库非常多基因集,所以运行速度会很慢。
      if(T){
        go_enrich_results <- lapply( g_list , function(gene) {
          lapply( c('BP','MF','CC') , function(ont) {
            cat(paste('Now process ',ont ))
            ego <- enrichGO(gene          = gene,
                            #universe     = gene_all,
                            OrgDb         = org.Hs.eg.db,
                            ont           = ont ,
                            pAdjustMethod = "BH",
                            pvalueCutoff  = 0.99,
                            qvalueCutoff  = 0.99,
                            readable      = TRUE)
    
            print( head(ego) )
            return(ego)
          })
        })
        save(go_enrich_results,file =paste0(pro, '_go_enrich_results.Rdata'))
    
      }
    
      n1= c('gene_up','gene_down','gene_diff')
      n2= c('BP','MF','CC') 
      for (i in 1:3){
        for (j in 1:3){
          fn=paste0(pro, '_dotplot_',n1[i],'_',n2[j],'.png')
          cat(paste0(fn,'\n'))
          png(fn,res=150,width = 1080)
          print( dotplot(go_enrich_results[[i]][[j]] ))
          dev.off()
        }
      }
      kegg_plot <- function(up_kegg,down_kegg){
      dat=rbind(up_kegg,down_kegg)
      colnames(dat)
      dat$pvalue = -log10(dat$pvalue)
      dat$pvalue=dat$pvalue*dat$group 
    
      dat=dat[order(dat$pvalue,decreasing = F),]
    
      g_kegg<- ggplot(dat, aes(x=reorder(Description,order(pvalue, decreasing = F)), y=pvalue, fill=group)) + 
        geom_bar(stat="identity") + 
        scale_fill_gradient(low="blue",high="red",guide = FALSE) + 
        scale_x_discrete(name ="Pathway names") +
        scale_y_continuous(name ="log10P-value") +
        coord_flip() + theme_bw()+theme(plot.title = element_text(hjust = 0.5))+
        ggtitle("Pathway Enrichment") 
    }
    

    Function功能解释至此。


    #直接利用之前保存的脚本中的代码对正、负相关基因进行kegg富集分析
    run_kegg(gene_up,gene_down,pro='test')
    

    image.png

    #对正相关基因进行富集分析
    go <- enrichGO(gene_up, 
                   OrgDb = "org.Hs.eg.db",
                   ont="all") 
    library(ggplot2)
    library(stringr)
    barplot(go, 
            split="ONTOLOGY")+ facet_grid(ONTOLOGY~., scale="free")+ 
      ggsave('gene_up_GO_all_barplot.png') 
    go=DOSE::setReadable(go, 
                         OrgDb='org.Hs.eg.db',
                         keyType='ENTREZID')
    write.csv( go@result,file = 'gene_up_GO_all_barplot.csv')
    
    #对负相关基因进行富集分析
    go <- enrichGO(gene_down, 
                   OrgDb = "org.Hs.eg.db", 
                   ont="all") 
    barplot(go, 
            split="ONTOLOGY")+ facet_grid(ONTOLOGY~., scale="free")+ 
      ggsave('gene_down_GO_all_barplot.png') 
    go=DOSE::setReadable(go, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
    write.csv( go@result,file = 'gene_down_GO_all_barplot.csv')