1.简介:细化的差异分析

  1. #拟时序分析-简介;细化的差异分析
  2. rm(list = ls())
  3. library(Seurat)
  4. # devtools::install_github('satijalab/seurat-data')
  5. library(SeuratData)
  6. library(ggplot2)
  7. library(patchwork)
  8. library(dplyr)
  9. load(file = '../section-01-cluster/basic.sce.pbmc.Rdata')
  10. pbmc
  11. DimPlot(pbmc,
  12. reduction = 'umap',
  13. label = TRUE, pt.size = 0.5) + NoLegend()
  14. ggsave('DimPlot-umap.pdf',
  15. units = 'cm',
  16. height = 8,
  17. width = 16)
  18. sce=pbmc
  19. table( Idents(sce ))
  20. table(sce@meta.data$seurat_clusters) #8个亚群
  21. table(sce@meta.data$orig.ident)
  22. #取Mono亚群
  23. levels(Idents(sce))
  24. sce = sce[, Idents(sce) %in% c("FCGR3A+ Mono","CD14+ Mono")] # CD16
  25. sce
  26. levels(Idents(sce))
  27. #亚群"CD14+ Mono"和"FCGR3A+ Mono"的差异分析结果
  28. markers_df <- FindMarkers(object = sce,
  29. ident.1 = 'FCGR3A+ Mono',
  30. ident.2 = 'CD14+ Mono',
  31. #logfc.threshold = 0,
  32. min.pct = 0.25)
  33. head(markers_df)
  34. #abs(markers_df$avg_log2FC) > 1 挑选
  35. cg_markers_df=markers_df[abs(markers_df$avg_log2FC) > 1,]
  36. dim(cg_markers_df)
  37. #排序
  38. cg_markers_df=cg_markers_df[order(cg_markers_df$avg_log2FC),]
  39. DotPlot(sce,
  40. features = rownames(cg_markers_df))+ theme(axis.text.x = element_text(angle = 45,vjust = 0.5,hjust=0.5))
  41. ggsave('DotPlot-cg_markers_df-monocyte.pdf')
  42. DoHeatmap(sce,
  43. features = rownames(cg_markers_df))
  44. ggsave('DoHeatmap-cg_markers_df-monocyte.pdf')
  45. save(sce,file = 'sce-monocyte.Rdata')

image.pngimage.png

2.拟时序分析-构建对象

#拟时序分析-构建对象
rm(list = ls())
library(Seurat)
load(file = 'sce-monocyte.Rdata')
sce #挑选出的mono(642 samples)

#因为 monocle包是使用CellDataSet对象,所以重新构建
#在monocle包学习那一部分有细讲
library(monocle)
sample_ann <- sce@meta.data  
sample_ann$celltype=Idents(sce)
head(sample_ann)
gene_ann <- data.frame(
  gene_short_name = rownames(sce@assays$RNA) , 
  row.names =  rownames(sce@assays$RNA) 
)
head(gene_ann)

pd <- new("AnnotatedDataFrame",
          data=sample_ann)
fd <- new("AnnotatedDataFrame",
          data=gene_ann)
ct=as.data.frame(sce@assays$RNA@counts)
ct[1:4,1:4]

sc_cds <- newCellDataSet(
  as.matrix(ct), #表达矩阵
  phenoData = pd,#细胞的表型
  featureData =fd,#基因表型
  expressionFamily = negbinomial.size(),
  lowerDetectionLimit=1)

sc_cds #将Seurat对象转化成了monocle对象


#是monocle的官网标准流程(我们要做的仅仅是修改阈值即可)
library(monocle)
sc_cds
sc_cds <- detectGenes(sc_cds, min_expr = 1) #检测高于最小阈值的基因

#数值可以自行摸索
sc_cds <- sc_cds[fData(sc_cds)$num_cells_expressed > 10, ]
sc_cds

cds <- sc_cds
cds <- estimateSizeFactors(cds)
cds <- estimateDispersions(cds) 

#并不是所有的基因都有作用,所以先进行挑选,合适的基因用来进行聚类。
disp_table <- dispersionTable(cds) #检索指定均值-方差关系的值表
unsup_clustering_genes <- subset(disp_table,
                                 mean_expression >= 0.1)
unsup_clustering_genes

cds <- setOrderingFilter(cds, 
                         unsup_clustering_genes$gene_id) #标记聚类基因
plot_ordering_genes(cds) #通过平均值与离散度来绘制基因图,突出那些被选中进行排序的基因 
plot_pc_variance_explained(cds, 
                           return_all = F) # norm_method='log'

#其中 num_dim 参数选择基于上面的PCA图
#计算CellDataSet对象到低维空间的投影
cds <- reduceDimension(cds, 
                       max_components = 2, 
                       num_dim = 6,
                       reduction_method = 'tSNE',
                       verbose = T)

cds <- clusterCells(cds, num_clusters = 6) #人为设置6,没有规定,自己随意设置
plot_cell_clusters(cds, 1, 2 ) #人为分5群

table(pData(cds)$Cluster) 
colnames(pData(cds)) 
table(pData(cds)$seurat_clusters)
table(pData(cds)$Cluster,pData(cds)$seurat_clusters)
table(pData(cds)$Cluster,pData(cds)$celltype)
plot_cell_clusters(cds, 1, 2 )

#可以看到 monocle 给细胞重新定义了亚群,亚群数量是自己选择的
#整体来说,monocle和seurat各自独立流程定义的亚群的一致性还不错
#只是跑流程而已
save(cds,file = 'input_cds.Rdata')

#构建对象,seurat,monocle,scater
#monocle标准流程,降维聚类分群

image.png

#拟时序分析-monocle结果解读
rm(list = ls()) 
library(monocle)

load('input_cds.Rdata')
##查看monocle
cds 
#接下来很重要,到底是看哪个性状的轨迹
table(pData(cds)$Cluster)
table(pData(cds)$Cluster,pData(cds)$celltype)
plot_cell_clusters(cds, 1, 2 )

##我们这里并不能使用 monocle的分群
#还是依据前面的 seurat分群, 也就是说前面的代码仅仅是流程而已,我们没有使用那些结果哦

#其实取决于自己真实的生物学意图
pData(cds)$Cluster=pData(cds)$celltype
table(pData(cds)$Cluster)

Sys.time()
#检测基因差异表达
diff_test_res <- differentialGeneTest(cds,
                                      fullModelFormulaStr = "~Cluster")
Sys.time()

#Select genes that are significant at an FDR < 10%
sig_genes <- subset(diff_test_res, qval < 0.1)
sig_genes=sig_genes[order(sig_genes$pval),]
head(sig_genes[,c("gene_short_name", "pval", "qval")] ) 
cg=as.character(head(sig_genes$gene_short_name)) 

#挑选差异最显著的基因可视化
plot_genes_jitter(cds[cg,],
                  grouping = "Cluster",
                  color_by = "Cluster",
                  nrow= 3,
                  ncol = NULL )
cg2=as.character(tail(sig_genes$gene_short_name)) 

plot_genes_jitter(cds[cg2,],
                  grouping = "Cluster",
                  color_by = "Cluster",
                  nrow= 3,
                  ncol = NULL )

#前面是找差异基因,后面是做拟时序分析

#1.挑选合适的基因. (有多个方法,例如提供已知的基因集)
#这里选取统计学显著的差异基因列表
ordering_genes <- row.names (subset(diff_test_res, qval < 0.01))
ordering_genes
cds <- setOrderingFilter(cds, ordering_genes)
plot_ordering_genes(cds) 

#2.降维
cds <- reduceDimension(cds, 
                       max_components = 2,
                       method = 'DDRTree') #默认"DDRTree"
#3.对细胞进行排序
cds <- orderCells(cds)

#4.可视化
plot_cell_trajectory(cds, color_by = "Cluster")  
ggsave('monocle_cell_trajectory_for_seurat.pdf')

length(cg)
plot_genes_in_pseudotime(cds[cg,],
                         color_by = "Cluster") 
ggsave('monocle_plot_genes_in_pseudotime_for_seurat.pdf')

phe=pData(cds)
boxplot(phe$Pseudotime,phe$Cluster)

image.png
image.png
image.png

#前面根据差异基因,推断好了拟时序,也就是说把差异基因动态化了
#后面就可以具体推断哪些基因随着拟时序如何的变化
my_cds_subset=cds
# pseudotime is now a column in the phenotypic data as well as the cell state
head(pData(my_cds_subset))
# 这个differentialGeneTest会比较耗费时间
my_pseudotime_de <- differentialGeneTest(my_cds_subset,
                                         fullModelFormulaStr = "~sm.ns(Pseudotime)",
                                         cores = 4 ) #调用四个线程
head(my_pseudotime_de)
save( my_cds_subset,my_pseudotime_de,
      file = 'output_of_monocle.Rdata')

image.png
图 my_pseudotime_de

3.精雕细琢

##后面是对前面的结果进行精雕细琢
rm(list=ls())
options(stringsAsFactors = F)
library(Seurat)
library(gplots)
library(ggplot2)
library(monocle)
library(dplyr)
load(file = 'output_of_monocle.Rdata')

cds=my_cds_subset
phe=pData(cds)
colnames(phe)

library(ggsci)

#Cluster分群
p1=plot_cell_trajectory(cds, color_by = "Cluster")  + scale_color_nejm() 
p1
ggsave('trajectory_by_cluster.pdf')
plot_cell_trajectory(cds, color_by = "celltype")  

#时序的起始点
p2=plot_cell_trajectory(cds, color_by = "Pseudotime")  
p2
ggsave('trajectory_by_Pseudotime.pdf')

#State状态
p3=plot_cell_trajectory(cds, color_by = "State")  + scale_color_npg()
p3
ggsave('trajectory_by_State.pdf')

#组合
library(patchwork)
p1+p2/p3

image.png
图 组合图

library(dplyr)
my_pseudotime_de %>% arrange(qval) %>% head() 
# save the top 6 genes
my_pseudotime_de %>% arrange(qval) %>% head() %>% select(gene_short_name) -> my_pseudotime_gene
my_pseudotime_gene=my_pseudotime_gene[,1]
my_pseudotime_gene
plot_genes_in_pseudotime(my_cds_subset[my_pseudotime_gene,])+ scale_color_npg()
ggsave('monocle_top6_pseudotime_by_state.pdf')

plot_genes_jitter(my_cds_subset[my_pseudotime_gene,],
                  grouping = "Cluster",
                  color_by = "Cluster",
                  nrow= 3,
                  ncol = NULL )+ scale_color_nejm()
ggsave('monocle_top6_pseudotime_by_cluster.pdf')


# cluster the top 50 genes that vary as a function of pseudotime
my_pseudotime_de %>% arrange(qval) %>% head(50) %>% select(gene_short_name) -> gene_to_cluster
gene_to_cluster <- gene_to_cluster[,1]
gene_to_cluster
colnames(pData(my_cds_subset))
table(pData(my_cds_subset)$Cluster,pData(my_cds_subset)$State) 
ac=pData(my_cds_subset)[c('celltype','State','Pseudotime')]
head(ac)

#这个热图绘制的并不是纯粹的细胞基因表达量矩阵,而是被 Pseudotime 好了的100列,50行的矩阵
my_pseudotime_cluster <- plot_pseudotime_heatmap(my_cds_subset[gene_to_cluster,],
                                                 # num_clusters = 2, 
                                                 # add_annotation_col = ac,
                                                 show_rownames = TRUE,
                                                 return_heatmap = TRUE)

pdf('monocle_top50_heatmap.pdf')
dev.off()