Usage
Usage:
enricher(
gene,
pvalueCutoff = 0.05,
pAdjustMethod = "BH",
universe,
minGSSize = 10,
maxGSSize = 500,
qvalueCutoff = 0.2,
TERM2GENE,
TERM2NAME = NA
)
Arguments:
gene: a vector of gene id
pvalueCutoff: pvalue cutoff on enrichment tests to report
pAdjustMethod: one of "holm", "hochberg", "hommel", "bonferroni", "BH",
"BY", "fdr", "none"
universe: background genes. If missing, the all genes listed in the
database (eg TERM2GENE table) will be used as background.
minGSSize: minimal size of genes annotated for testing
maxGSSize: maximal size of genes annotated for testing
qvalueCutoff: qvalue cutoff on enrichment tests to report as
significant. Tests must pass i) ‘pvalueCutoff’ on unadjusted
pvalues, ii) ‘pvalueCutoff’ on adjusted pvalues and iii)
‘qvalueCutoff’ on qvalues to be reported.
TERM2GENE: user input annotation of TERM TO GENE mapping, a data.frame
of 2 column with term and gene
TERM2NAME: user input of TERM TO NAME mapping, a data.frame of 2 column
with term and name
Example
#!/usr/bin/env Rscript
library(ggplot2)
library(optparse)
suppressMessages(library(showtext))
suppressMessages(library(clusterProfiler))
description <- "
\033[1;32mDifferential Enrichment Analysis and Plot for KEGG/GO\033[0m
"
# =========================================== 参数解析 =========================================================
option_list = list(
make_option(c('-b', '--background'), help = 'The background file contains geneid and function information'),
make_option(c('-g', '--diff_genes'), help = 'The different genes'),
make_option(c('-t', '--type'), help = 'The type of your function, choices: kegg, go'),
make_option(c('-x', '--ontology'), help = 'The ontology for GO enrichment, choices: BP, CC, MF'),
make_option(c('-o', '--outdir'), help = 'The output directory [default: %default]', default = '.'),
make_option(c('-n', '--number'), help = 'The max number used to plot [default: %default]', default = 10, type='integer'),
make_option(c('-p', '--pcutoff'), help = 'The pvalue(p.adjust) cutoff [default: %default]', default = 0.05, type='double'),
make_option(c('-m', '--method'), help = 'The adjustment method [default: %default]', default = 'BH'),
make_option(c('-T', '--title'), help = 'The title of the picture'),
make_option(c('-X', '--xlab'), help = 'The xlab of the picture'),
make_option(c('-Y', '--ylab'), help = 'The ylab of the picture'),
make_option(c('-W', '--width'), type = 'integer', help = 'The width of the picture'),
make_option(c('-H', '--height'), type = 'integer', help = 'The height of the picture')
)
parser <- OptionParser(description = description,
option_list = option_list,
epilogue = 'contact: suqingdong@novogene.com')
args <- parse_args(parser)
if (is.null(args$background) || is.null(args$diff_genes)) {
print_help(parser)
quit('no')
}
# ============================================================================================================
# # just for testing
# args = data.frame(diff_genes = 'diff_gene.list',
# background = 'geneid_kegg.tsv',
# pcutoff = 0.05,
# method = 'BH')
# print(args)
type = args$type
ontology = args$ontology
pcutoff = args$pcutoff
method = args$method
number = args$number
# showtext
# showtext_auto()
# font_add('Times New Roman', './times.ttf')
# 差异基因:每行一个基因ID
diff_genes <- read.table(args$diff_genes)[, 1]
# 背景基因:geneID <TAB> Term
bg_data <- read.table(args$background, sep = '\t')
colnames(bg_data) = c('geneid', 'term')
# term去重,避免重复获取
term_list <- unique(bg_data$term)
# type未指定是自动判断
if (is.null(args$type)) {
if (startsWith(term_list[1], 'GO:')) {
type = 'go'
} else {
type = 'kegg'
}
}
# 添加注释
if (type == 'kegg') {
ko_names <- ko2name(term_list)
merged_data <- merge(bg_data, ko_names, by.x = 'term', by.y = 'ko')
term2gene <- data.frame(merged_data$term, merged_data$geneid)
term2name <- data.frame(merged_data$term, merged_data$name)
} else {
go_terms <- go2term(term_list)
go_onts <- go2ont(term_list)
md <- merge(bg_data, go_terms, by.x = 'term', by.y = 'go_id')
merged_data <- merge(md, go_onts, by.x = 'term', by.y = 'go_id')
if( ! is.null(ontology) ) {
merged_data <- merged_data[merged_data$Ontology==ontology, ]
type <- paste(type, ontology, sep='.')
}
term2gene <- data.frame(merged_data$term, merged_data$geneid)
term2name <- data.frame(merged_data$term, merged_data$Term)
}
# 富集分析
result = enricher(
diff_genes,
TERM2GENE = term2gene,
TERM2NAME = term2name,
pvalueCutoff = pcutoff,
qvalueCutoff = 1,
pAdjustMethod = method # BH == FDR
)
# 结果转数据框
result_df = data.frame(result)
# 按p.adjust正序排序, 分值小的在上
result_df = result_df[order(result_df$p.adjust, decreasing = F), ]
# 保存文件
outfile = sprintf('%s.result.xls', type)
write.table(result_df, outfile, sep='\t', quote = F, row.names = F)
# 截取前N行
result_df = head(result_df, number)
# Ratio列转换成数值
calc_ratio = function(x) {
res = as.numeric(unlist(strsplit(x, "/")))
res[1] / res[2]
}
result_df$GeneRatio = unlist(lapply(result_df$GeneRatio, calc_ratio))
# Description列设置因子,用于控制显示顺序
# 按p.adjust逆序排序, 分值小的图片中在上
result_df = result_df[order(result_df$p.adjust, decreasing = T), ]
result_df$Description = factor(result_df$Description, levels=result_df$Description)
# 样式
family = 'Times New Roman'
size = 28
angle = 0
theme.style <- theme(
text = element_text(family = family),
axis.text = element_text(size = size),
axis.text.y = element_text(angle = angle),
)
# 绘图
# barplot
outbar = sprintf('%s.barplot.png', type)
barplot <-
ggplot(result_df, aes(x = Count, y = Description, fill = p.adjust)) +
geom_bar(stat = 'identity') +
theme_bw() +
theme.style +
scale_fill_gradient(low = "red", high = "blue")
ggsave(outbar, barplot, type = 'cairo')
sprintf('save file: %s', outbar)
# dotplot
outdot = sprintf('%s.dotplot.png', type)
dotplot <-
ggplot(result_df, aes(x=GeneRatio, y=Description, color=p.adjust, size=Count)) +
geom_point() +
scale_color_gradient(low="red", high="blue") +
theme_bw() +
theme.style
ggsave(outdot, dotplot, type='cairo')
sprintf('save file: %s', outdot)