image.png

流程

image.png

数据分析可视化

image.pngimage.png

本地分析脚本

  1. rm(list = ls())
  2. options(stringsAsFactors = F)
  3. # 加载包
  4. library(edgeR)
  5. library(ggplot2)
  6. # 读取基因表达矩阵信息并查看分组信息和表达矩阵数据
  7. lname <- load(file = "data/Step01-airwayData.Rdata");lname
  8. # 表达谱
  9. filter_count[1:4,1:4]
  10. # 分组信息
  11. group_list <- group[match(colnames(filter_count),group$run_accession),2]
  12. group_list
  13. comp <- unlist(strsplit("Dex_vs_untreated",split = "_vs_"))
  14. group_list <- factor(group_list,levels = comp)
  15. group_list
  16. table(group_list)
  17. # 构建线性模型。0代表x线性模型的截距为0
  18. design <- model.matrix(~0+group_list)
  19. rownames(design) <- colnames(filter_count)
  20. colnames(design) <- levels(factor(group_list))
  21. design
  22. # 构建edgeR的DGEList对象
  23. DEG <- DGEList(counts=filter_count,
  24. group=factor(group_list))
  25. # 归一化基因表达分布
  26. DEG <- calcNormFactors(DEG)
  27. # 计算线性模型的参数
  28. DEG <- estimateGLMCommonDisp(DEG,design)
  29. DEG <- estimateGLMTrendedDisp(DEG, design)
  30. DEG <- estimateGLMTagwiseDisp(DEG, design)
  31. # 拟合线性模型
  32. fit <- glmFit(DEG, design)
  33. # 进行差异分析
  34. lrt <- glmLRT(fit, contrast=c(1,-1))
  35. # 提取过滤差异分析结果
  36. DEG_edgeR <- as.data.frame(topTags(lrt, n=nrow(DEG)))
  37. head(DEG_edgeR)
  38. # 筛选上下调,设定阈值
  39. fc_cutoff <- 1.5
  40. pvalue <- 0.05
  41. DEG_edgeR$regulated <- "normal"
  42. loc_up <- intersect(which( DEG_edgeR$logFC > log2(fc_cutoff) ),
  43. which( DEG_edgeR$PValue < pvalue) )
  44. loc_down <- intersect(which(DEG_edgeR$logFC < (-log2(fc_cutoff))),
  45. which(DEG_edgeR$PValue<pvalue))
  46. DEG_edgeR$regulated[loc_up] <- "up"
  47. DEG_edgeR$regulated[loc_down] <- "down"
  48. table(DEG_edgeR$regulated)
  49. ## 添加一列gene symbol
  50. # 方法1:使用包
  51. library(org.Hs.eg.db)
  52. keytypes(org.Hs.eg.db)
  53. library(clusterProfiler)
  54. id2symbol <- bitr(rownames(DEG_edgeR),
  55. fromType = "ENSEMBL",
  56. toType = "SYMBOL",
  57. OrgDb = org.Hs.eg.db)
  58. head(id2symbol)
  59. DEG_edgeR <- cbind(GeneID=rownames(DEG_edgeR),DEG_edgeR)
  60. DEG_edgeR_symbol <- merge(id2symbol,DEG_edgeR,
  61. by.x="ENSEMBL",by.y="GeneID",all.y=T)
  62. head(DEG_edgeR_symbol)
  63. # 方法2:gtf文件中得到的id与name关系
  64. # Assembly: GRCh37(hg19) Release: ?
  65. # 使用上课测试得到的count做
  66. # 选择显著差异表达的结果
  67. library(tidyverse)
  68. DEG_edgeR_symbol_Sig <- filter(DEG_edgeR_symbol,regulated!="normal")
  69. # 保存
  70. write.csv(DEG_edgeR_symbol,"result/4.DEG_edgeR_all.csv", row.names = F)
  71. write.csv(DEG_edgeR_symbol_Sig,"result/4.DEG_edgeR_Sig.csv", row.names = F)
  72. save(DEG_edgeR_symbol,file = "data/Step03-edgeR_nrDEG.Rdata")
  73. ##====== 检查是否上下调设置错了
  74. # 挑选一个差异表达基因
  75. head(DEG_edgeR_symbol_Sig)
  76. exp <- c(t(express_cpm[match("ENSG00000001626",rownames(express_cpm)),]))
  77. test <- data.frame(value=exp, group=group_list)
  78. ggplot(data=test,aes(x=group,y=value,fill=group)) + geom_boxplot()

服务器分析传参脚本

image.png
原因:
1,可以在服务器上解决数据分析,实现环境统一
2,可以批量跑很多方案
脚本

## 帮助文档
library(getopt)
spec <- matrix(c(    
  'help',   'h',  0,  "logical",
  'count',  'c',  1,  "character", 
  'group',  'g',  1,  "character", 
  'comp',   'C',  1,  "character", 
  'fc',     'f',  1,  "numeric", 
  'pvalue', 'p',  1,  "numeric", 
  'od',     'o',  1,  "character"
), byrow = TRUE, ncol = 4)
opt <- getopt(spec)

print_usage <- function(spec=NULL){
  cat("Usage example: Rscript edgeR.R --count rawdata.txt --group group.txt --od ./
  Options: 
    --help  NULL
    --count   character  gene expression , raw count [forced]
    --group   character  group information [forced]
    --comp    character  A vs B, example: Dex_vs_untreated
    --fc      numeric    fold change cutoff, default: 1.5
    --pvalue  numeric    pvalue cutoff, default:0.05
    --od      character  outdir
  \n")
  q(status=1)
}
if (!is.null(opt$help) |is.null(opt$count)  |is.null(opt$od)) { print_usage(spec) }
if (is.null(opt$group) ) { print_usage(spec) }
if (is.null(opt$fc) )    { opt$fc <- 1.5 }
if (is.null(opt$pvalue) ) { opt$pvalue <- 0.05 }
if (!file.exists(opt$od)) { dir.create(opt$od) }

library(edgeR)
library(org.Hs.eg.db)
library(ggplot2)
library(tidyverse)

# 读取基因表达矩阵信息并查看分组信息和表达矩阵数据
# 读取基因表达矩阵
exprSet <- read.delim(opt$count,row.names = 1,sep = "\t",header = T,check.names = F)
# 检查表达谱
dim(exprSet)
exprSet[1:6,1:6]

group <- read.table(opt$group,header = T,sep = "\t",row.names = 1)
head(group)
group <- group[match(rownames(group),colnames(exprSet)),]

# 处理分组信息
comp <- unlist(strsplit(opt$comp,split = "_vs_"))
group_list <- factor(group,levels = comp)
group_list
table(group_list)


# 构建线性模型。0代表x线性模型的截距为0
design <- model.matrix(~0+group_list)
rownames(design) <- colnames(exprSet)
colnames(design) <- comp
design

# 构建edgeR的DGEList对象
DEG <- DGEList(counts=exprSet,  group=factor(group_list))

# 归一化基因表达分布
DEG <- calcNormFactors(DEG)

# 计算线性模型的参数
DEG <- estimateGLMCommonDisp(DEG,design)
DEG <- estimateGLMTrendedDisp(DEG, design)
DEG <- estimateGLMTagwiseDisp(DEG, design)

# 拟合线性模型
fit <- glmFit(DEG, design)

# 进行差异分析,1,-1意味着前比后
lrt <- glmLRT(fit, contrast=c(1,-1))

# 提取过滤差异分析结果
DEG_edgeR <- as.data.frame(topTags(lrt, n=nrow(DEG)))
head(DEG_edgeR)

# 筛选上下调,设定阈值
fc_cutoff <- opt$fc
pvalue <- opt$pvalue

DEG_edgeR$regulated <- "normal"

loc_up <- intersect(which(DEG_edgeR$logFC>log2(fc_cutoff)),
                    which(DEG_edgeR$PValue<pvalue))
loc_down <- intersect(which(DEG_edgeR$logFC < (-log2(fc_cutoff))),
                      which(DEG_edgeR$PValue<pvalue))

DEG_edgeR$regulated[loc_up] <- "up"
DEG_edgeR$regulated[loc_down] <- "down"

table(DEG_edgeR$regulated)

DEG_edgeR <- cbind(GeneID=rownames(DEG_edgeR),DEG_edgeR)
DEG_edgeR_Sig <- filter(DEG_edgeR,regulated!="normal")

# 保存
setwd(opt$od)
write.table(DEG_edgeR,paste0(opt$comp,"_edgeR_all.xls"), row.names = F, sep="\t",quote = F)
write.table(DEG_edgeR_Sig,paste0(opt$comp,"_edgeR_Sig.xls"), row.names = F, sep="\t",quote = F)