需要的数据:
- 基因表达矩阵:行是基因,列是细胞
- 细胞分组信息:一共一列,行名为细胞,第一列名字为
Cell.type
,代表分组信息
依赖的R包
- GSEAbase
- limma
- dplyr
- tidyr
- GSVA
- pystr (github)
- purrr
- tibble
- pheatmap
代码
library(GSEABase)
library(Biobase)
library(genefilter)
library(limma)
library(RColorBrewer)
library(GSVA)
library(dplyr)
library(pystr)
##这里可以任意切换geneset,只要你有
geneSets <- getGmt("/dellfsqd2/ST_LBI/USER/panxiaoguang/SingleCell/liangxue_data/cluster_1/GSVA/DataBase/c3.tft.v7.0.symbols.gmt")
re_grp<-function(x,s){
purrr::map_chr(x,function(haha){if (haha==s){"A"}else{"B"}})
}
###该函数x为分组编号,或者是细胞类型,或者是时间序列,cellinfo为细胞分组(行名为细胞,Cell.type为分组)
###num为想要提取的top n基因或者pathway,依赖dplyr,limma
cal_Diff<-function(x,cellInfo,res_es,num){
need<-cellInfo%>%transmute(new_group=re_grp(Cell.type,x))%>%.$new_group
grouP<-as.factor(need)
desigN<-model.matrix(~grouP+0)
rownames(desigN)<-rownames(cellInfo)
comparE<-makeContrasts(grouPA-grouPB,levels=desigN)
fiT <- lmFit(res_es, desigN)
fiT2 <- contrasts.fit(fiT, comparE)
fiT3 <- eBayes(fiT2)
Diff<-topTable(fiT3,p.value=0.05,coef=1,num=num)
if (nrow(Diff)>0){
Diff$cluster<-x
Diff<-Diff%>%tibble::rownames_to_column(var="Name")%>%tbl_df()%>%select(Name,logFC,t,P.Value,adj.P.Val,cluster)
}else{
Diff<-tibble::tibble(Name=NA,logFC=NA,t=NA,P.Value=NA,adj.P.Val=NA,cluster=NA)
}
}
###该函数mat输入表达矩阵,info输入分组信息(行名为细胞,Cell.type为分组信息),需要预先载入dplyr
Average_express<-function(mat,info){
new_res<-mat%>%tibble::rownames_to_column(var="Name")%>%tidyr::gather(cell,value,-Name)
new_cellinfo<-info
new_cellinfo<-tibble::rownames_to_column(new_cellinfo,var="cell")
all<-left_join(new_res,new_cellinfo,by="cell")
fuck<-all%>%group_by(Name,Cell.type)%>%summarise(Mean=mean(value))%>%tidyr::spread(Cell.type,Mean)
fuck
}
for(i in c("PF","PO3","PO7","PO11")){
expMat<- read.table(pystr_format("cluster_1.exprMat_{1}.txt",i),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
cellInfo<-read.table(pystr_format("cluster_1.cellInfo_{1}.txt",i),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
mydata<-as.matrix(expMat)
res_es <- gsva(mydata, geneSets, min.sz=10, max.sz=500, verbose=FALSE, parallel.sz=8)
res_es<-as.data.frame(res_es)
write.table(res_es,pystr_format("./cell_diff/cluster_1_{1}_c3.txt",i),sep="\t",quote=FALSE)
##差异分析
##该函数用来分组,x为一个列表,s为输入的分组信息
##计算差异基因集的合集
#fin<-unique(unlist(lapply(list(0,1,2,3),cal_Diff,cellInfo=cellInfo,res_es=res_es,num=10)))
##这里要明确细胞分类的个数,多了会报错
fin<-do.call('rbind',lapply(list(0,1,2,3),cal_Diff,cellInfo=cellInfo,res_es=res_es,num=10))
fin<-fin%>%filter(!is.na(cluster))
write.table(fin,pystr_format("./cell_diff/cluster_1_{1}_c3_diff_geneset.txt",i),sep="\t",quote=FALSE,row.names=FALSE)
###计算平均表达量矩阵
heihei2<-Average_express(res_es,cellInfo)
heihei_need<-filter(heihei2,Name%in%unique(fin$Name))
houhou2<-tibble::column_to_rownames(heihei_need,var="Name")
###画热图
pheatmap::pheatmap(houhou2,file=pystr_format("./cell_diff/cluster_1_{1}_c3_diff_heat.pdf",i))
}