rm(list = ls())load(file = "step2output.Rdata")#差异分析,用limma包来做#需要表达矩阵和Group,不需要改library(limma)design=model.matrix(~Group)fit=lmFit(exp,design)fit=eBayes(fit)deg=topTable(fit,coef=2,number = Inf)
#为deg数据框添加几列#1.加probe_id列,把行名变成一列library(dplyr)deg <- mutate(deg,probe_id=rownames(deg))head(deg)
#2.加上探针注释ids = ids[!duplicated(ids$symbol),]
加上基因名
deg <- inner_join(deg,ids,by="probe_id")head(deg)nrow(deg)
加质控列,设置阈值
#3.加change列,标记上下调基因logFC_t=1 ##阈值P.Value_t = 0.05k1 = (deg$P.Value < P.Value_t)&(deg$logFC < -logFC_t)k2 = (deg$P.Value < P.Value_t)&(deg$logFC > logFC_t)deg <- mutate(deg,change = ifelse(k1,"down",ifelse(k2,"up","stable")))table(deg$change)
加ENTREZID列,用于富集分析
#4.加ENTREZID列,用于富集分析(symbol转entrezid,然后inner_join)library(clusterProfiler)library(org.Hs.eg.db)s2e <- bitr(deg$symbol,fromType = "SYMBOL",toType = "ENTREZID",OrgDb = org.Hs.eg.db)#人类,存放人类各种不同基因ID的包#其他物种http://bioconductor.org/packages/release/BiocViews.html#___OrgDbdim(deg)
deg <- inner_join(deg,s2e,by=c("symbol"="SYMBOL")) ##列名不一样时必须这样写dim(deg)length(unique(deg$symbol)) ##查看有无重复save(Group,deg,logFC_t,P.Value_t,gse_number,file = "step4output.Rdata")
