rm(list = ls())
load(file = "step2output.Rdata")
#差异分析,用limma包来做
#需要表达矩阵和Group,不需要改
library(limma)
design=model.matrix(~Group)
fit=lmFit(exp,design)
fit=eBayes(fit)
deg=topTable(fit,coef=2,number = Inf)
#为deg数据框添加几列
#1.加probe_id列,把行名变成一列
library(dplyr)
deg <- mutate(deg,probe_id=rownames(deg))
head(deg)
#2.加上探针注释
ids = ids[!duplicated(ids$symbol),]
加上基因名
deg <- inner_join(deg,ids,by="probe_id")
head(deg)
nrow(deg)
加质控列,设置阈值
#3.加change列,标记上下调基因
logFC_t=1 ##阈值
P.Value_t = 0.05
k1 = (deg$P.Value < P.Value_t)&(deg$logFC < -logFC_t)
k2 = (deg$P.Value < P.Value_t)&(deg$logFC > logFC_t)
deg <- mutate(deg,change = ifelse(k1,"down",
ifelse(k2,"up","stable")))
table(deg$change)
加ENTREZID列,用于富集分析
#4.加ENTREZID列,用于富集分析(symbol转entrezid,然后inner_join)
library(clusterProfiler)
library(org.Hs.eg.db)
s2e <- bitr(deg$symbol,
fromType = "SYMBOL",
toType = "ENTREZID",
OrgDb = org.Hs.eg.db)#人类,存放人类各种不同基因ID的包
#其他物种http://bioconductor.org/packages/release/BiocViews.html#___OrgDb
dim(deg)
deg <- inner_join(deg,s2e,by=c("symbol"="SYMBOL")) ##列名不一样时必须这样写
dim(deg)
length(unique(deg$symbol)) ##查看有无重复
save(Group,deg,logFC_t,P.Value_t,gse_number,file = "step4output.Rdata")