1.简介:细化的差异分析
#拟时序分析-简介;细化的差异分析rm(list = ls())library(Seurat)# devtools::install_github('satijalab/seurat-data')library(SeuratData)library(ggplot2)library(patchwork)library(dplyr)load(file = '../section-01-cluster/basic.sce.pbmc.Rdata')pbmcDimPlot(pbmc,reduction = 'umap',label = TRUE, pt.size = 0.5) + NoLegend()ggsave('DimPlot-umap.pdf',units = 'cm',height = 8,width = 16)sce=pbmctable( Idents(sce ))table(sce@meta.data$seurat_clusters) #8个亚群table(sce@meta.data$orig.ident)#取Mono亚群levels(Idents(sce))sce = sce[, Idents(sce) %in% c("FCGR3A+ Mono","CD14+ Mono")] # CD16scelevels(Idents(sce))#亚群"CD14+ Mono"和"FCGR3A+ Mono"的差异分析结果markers_df <- FindMarkers(object = sce,ident.1 = 'FCGR3A+ Mono',ident.2 = 'CD14+ Mono',#logfc.threshold = 0,min.pct = 0.25)head(markers_df)#abs(markers_df$avg_log2FC) > 1 挑选cg_markers_df=markers_df[abs(markers_df$avg_log2FC) > 1,]dim(cg_markers_df)#排序cg_markers_df=cg_markers_df[order(cg_markers_df$avg_log2FC),]DotPlot(sce,features = rownames(cg_markers_df))+ theme(axis.text.x = element_text(angle = 45,vjust = 0.5,hjust=0.5))ggsave('DotPlot-cg_markers_df-monocyte.pdf')DoHeatmap(sce,features = rownames(cg_markers_df))ggsave('DoHeatmap-cg_markers_df-monocyte.pdf')save(sce,file = 'sce-monocyte.Rdata')
2.拟时序分析-构建对象
#拟时序分析-构建对象
rm(list = ls())
library(Seurat)
load(file = 'sce-monocyte.Rdata')
sce #挑选出的mono(642 samples)
#因为 monocle包是使用CellDataSet对象,所以重新构建
#在monocle包学习那一部分有细讲
library(monocle)
sample_ann <- sce@meta.data
sample_ann$celltype=Idents(sce)
head(sample_ann)
gene_ann <- data.frame(
gene_short_name = rownames(sce@assays$RNA) ,
row.names = rownames(sce@assays$RNA)
)
head(gene_ann)
pd <- new("AnnotatedDataFrame",
data=sample_ann)
fd <- new("AnnotatedDataFrame",
data=gene_ann)
ct=as.data.frame(sce@assays$RNA@counts)
ct[1:4,1:4]
sc_cds <- newCellDataSet(
as.matrix(ct), #表达矩阵
phenoData = pd,#细胞的表型
featureData =fd,#基因表型
expressionFamily = negbinomial.size(),
lowerDetectionLimit=1)
sc_cds #将Seurat对象转化成了monocle对象
#是monocle的官网标准流程(我们要做的仅仅是修改阈值即可)
library(monocle)
sc_cds
sc_cds <- detectGenes(sc_cds, min_expr = 1) #检测高于最小阈值的基因
#数值可以自行摸索
sc_cds <- sc_cds[fData(sc_cds)$num_cells_expressed > 10, ]
sc_cds
cds <- sc_cds
cds <- estimateSizeFactors(cds)
cds <- estimateDispersions(cds)
#并不是所有的基因都有作用,所以先进行挑选,合适的基因用来进行聚类。
disp_table <- dispersionTable(cds) #检索指定均值-方差关系的值表
unsup_clustering_genes <- subset(disp_table,
mean_expression >= 0.1)
unsup_clustering_genes
cds <- setOrderingFilter(cds,
unsup_clustering_genes$gene_id) #标记聚类基因
plot_ordering_genes(cds) #通过平均值与离散度来绘制基因图,突出那些被选中进行排序的基因
plot_pc_variance_explained(cds,
return_all = F) # norm_method='log'
#其中 num_dim 参数选择基于上面的PCA图
#计算CellDataSet对象到低维空间的投影
cds <- reduceDimension(cds,
max_components = 2,
num_dim = 6,
reduction_method = 'tSNE',
verbose = T)
cds <- clusterCells(cds, num_clusters = 6) #人为设置6,没有规定,自己随意设置
plot_cell_clusters(cds, 1, 2 ) #人为分5群
table(pData(cds)$Cluster)
colnames(pData(cds))
table(pData(cds)$seurat_clusters)
table(pData(cds)$Cluster,pData(cds)$seurat_clusters)
table(pData(cds)$Cluster,pData(cds)$celltype)
plot_cell_clusters(cds, 1, 2 )
#可以看到 monocle 给细胞重新定义了亚群,亚群数量是自己选择的
#整体来说,monocle和seurat各自独立流程定义的亚群的一致性还不错
#只是跑流程而已
save(cds,file = 'input_cds.Rdata')
#构建对象,seurat,monocle,scater
#monocle标准流程,降维聚类分群

#拟时序分析-monocle结果解读
rm(list = ls())
library(monocle)
load('input_cds.Rdata')
##查看monocle
cds
#接下来很重要,到底是看哪个性状的轨迹
table(pData(cds)$Cluster)
table(pData(cds)$Cluster,pData(cds)$celltype)
plot_cell_clusters(cds, 1, 2 )
##我们这里并不能使用 monocle的分群
#还是依据前面的 seurat分群, 也就是说前面的代码仅仅是流程而已,我们没有使用那些结果哦
#其实取决于自己真实的生物学意图
pData(cds)$Cluster=pData(cds)$celltype
table(pData(cds)$Cluster)
Sys.time()
#检测基因差异表达
diff_test_res <- differentialGeneTest(cds,
fullModelFormulaStr = "~Cluster")
Sys.time()
#Select genes that are significant at an FDR < 10%
sig_genes <- subset(diff_test_res, qval < 0.1)
sig_genes=sig_genes[order(sig_genes$pval),]
head(sig_genes[,c("gene_short_name", "pval", "qval")] )
cg=as.character(head(sig_genes$gene_short_name))
#挑选差异最显著的基因可视化
plot_genes_jitter(cds[cg,],
grouping = "Cluster",
color_by = "Cluster",
nrow= 3,
ncol = NULL )
cg2=as.character(tail(sig_genes$gene_short_name))
plot_genes_jitter(cds[cg2,],
grouping = "Cluster",
color_by = "Cluster",
nrow= 3,
ncol = NULL )
#前面是找差异基因,后面是做拟时序分析
#1.挑选合适的基因. (有多个方法,例如提供已知的基因集)
#这里选取统计学显著的差异基因列表
ordering_genes <- row.names (subset(diff_test_res, qval < 0.01))
ordering_genes
cds <- setOrderingFilter(cds, ordering_genes)
plot_ordering_genes(cds)
#2.降维
cds <- reduceDimension(cds,
max_components = 2,
method = 'DDRTree') #默认"DDRTree"
#3.对细胞进行排序
cds <- orderCells(cds)
#4.可视化
plot_cell_trajectory(cds, color_by = "Cluster")
ggsave('monocle_cell_trajectory_for_seurat.pdf')
length(cg)
plot_genes_in_pseudotime(cds[cg,],
color_by = "Cluster")
ggsave('monocle_plot_genes_in_pseudotime_for_seurat.pdf')
phe=pData(cds)
boxplot(phe$Pseudotime,phe$Cluster)



#前面根据差异基因,推断好了拟时序,也就是说把差异基因动态化了
#后面就可以具体推断哪些基因随着拟时序如何的变化
my_cds_subset=cds
# pseudotime is now a column in the phenotypic data as well as the cell state
head(pData(my_cds_subset))
# 这个differentialGeneTest会比较耗费时间
my_pseudotime_de <- differentialGeneTest(my_cds_subset,
fullModelFormulaStr = "~sm.ns(Pseudotime)",
cores = 4 ) #调用四个线程
head(my_pseudotime_de)
save( my_cds_subset,my_pseudotime_de,
file = 'output_of_monocle.Rdata')

图 my_pseudotime_de
3.精雕细琢
##后面是对前面的结果进行精雕细琢
rm(list=ls())
options(stringsAsFactors = F)
library(Seurat)
library(gplots)
library(ggplot2)
library(monocle)
library(dplyr)
load(file = 'output_of_monocle.Rdata')
cds=my_cds_subset
phe=pData(cds)
colnames(phe)
library(ggsci)
#Cluster分群
p1=plot_cell_trajectory(cds, color_by = "Cluster") + scale_color_nejm()
p1
ggsave('trajectory_by_cluster.pdf')
plot_cell_trajectory(cds, color_by = "celltype")
#时序的起始点
p2=plot_cell_trajectory(cds, color_by = "Pseudotime")
p2
ggsave('trajectory_by_Pseudotime.pdf')
#State状态
p3=plot_cell_trajectory(cds, color_by = "State") + scale_color_npg()
p3
ggsave('trajectory_by_State.pdf')
#组合
library(patchwork)
p1+p2/p3

图 组合图
library(dplyr)
my_pseudotime_de %>% arrange(qval) %>% head()
# save the top 6 genes
my_pseudotime_de %>% arrange(qval) %>% head() %>% select(gene_short_name) -> my_pseudotime_gene
my_pseudotime_gene=my_pseudotime_gene[,1]
my_pseudotime_gene
plot_genes_in_pseudotime(my_cds_subset[my_pseudotime_gene,])+ scale_color_npg()
ggsave('monocle_top6_pseudotime_by_state.pdf')
plot_genes_jitter(my_cds_subset[my_pseudotime_gene,],
grouping = "Cluster",
color_by = "Cluster",
nrow= 3,
ncol = NULL )+ scale_color_nejm()
ggsave('monocle_top6_pseudotime_by_cluster.pdf')
# cluster the top 50 genes that vary as a function of pseudotime
my_pseudotime_de %>% arrange(qval) %>% head(50) %>% select(gene_short_name) -> gene_to_cluster
gene_to_cluster <- gene_to_cluster[,1]
gene_to_cluster
colnames(pData(my_cds_subset))
table(pData(my_cds_subset)$Cluster,pData(my_cds_subset)$State)
ac=pData(my_cds_subset)[c('celltype','State','Pseudotime')]
head(ac)
#这个热图绘制的并不是纯粹的细胞基因表达量矩阵,而是被 Pseudotime 好了的100列,50行的矩阵
my_pseudotime_cluster <- plot_pseudotime_heatmap(my_cds_subset[gene_to_cluster,],
# num_clusters = 2,
# add_annotation_col = ac,
show_rownames = TRUE,
return_heatmap = TRUE)
pdf('monocle_top50_heatmap.pdf')
dev.off()

