单细胞.png

    1. rm(list=ls())
    2. library(dplyr)
    3. library(Seurat)
    4. library(patchwork)
    5. ## =============1.Load the PBMC dataset
    6. data_dir <- "data/pbmc3k_filtered_gene_bc_matrices/"
    7. pbmc.data <- Read10X(data.dir = data_dir)
    8. pbmc.data
    9. # Initialize the Seurat object with the raw (non-normalized data)
    10. # min.cell:每个feature至少在多少个细胞中表达
    11. # min.features:每个细胞中至少有多少个feature被检测到
    12. pbmc <- CreateSeuratObject(counts = pbmc.data,
    13. project = "pbmc3k",
    14. min.cells = 3,
    15. min.features = 200)
    16. pbmc
    17. ## =============2.探索一下数据
    18. # count matrix长什么样?
    19. # 首先看看三个基因的count值,'.'values in the matrix represent 0s (no molecules detected)
    20. pbmc.data[1:30, 1:30]
    21. pbmc.data[c("CD3D", "TCL1A", "MS4A1"), 1:30]
    22. rownames(pbmc.data)
    23. colnames(pbmc.data)
    24. # 0值矩阵大小
    25. raw_counts <- as.matrix(pbmc.data)
    26. raw_counts[1:4,1:4]
    27. dense.size <- object.size(raw_counts)
    28. dense.size
    29. # 稀疏矩阵大小
    30. sparse.size <- object.size(pbmc.data)
    31. sparse.size
    32. # 压缩比
    33. dense.size/sparse.size
    34. ## =============3.QC:
    35. # 1.每个细胞中RNA的个数;
    36. # 2,每个细胞中线粒体基因表达的比例
    37. # The [[ operator can add columns to object metadata. This is a great place to stash QC stats
    38. grep(pattern = "^MT-", rownames(pbmc), value = T)
    39. pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
    40. # QC指标
    41. head(pbmc@meta.data, 5)
    42. # QC指标使用小提琴图可视化
    43. VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
    44. ncol = 3,pt.size = 0.1)
    45. # 两个指标之间的相关性
    46. plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA",
    47. feature2 = "percent.mt")
    48. plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA",
    49. feature2 = "nFeature_RNA")
    50. plot1 + plot2
    51. # 过滤
    52. pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
    53. pbmc
    54. ## =============4.标准化
    55. # LogNormalize
    56. pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize",
    57. scale.factor = 10000)
    58. # 标准化后的值保存在:pbmc[["RNA"]]@data
    59. normalized.data <- pbmc[["RNA"]]@data
    60. normalized.data[1:20,1:4]
    61. dim(normalized.data)
    62. ## =============5.鉴定高变基因
    63. # 高变基因:在一些细胞中表达高,另一些细胞中表达低的基因
    64. # 变异指标: mean-variance relationship
    65. # 默认返回两千个高变基因,用于下游如PCA降维分析。
    66. pbmc <- FindVariableFeatures(pbmc, selection.method = "vst",
    67. nfeatures = 2000)
    68. # 提取前10的高变基因
    69. top10 <- head(VariableFeatures(pbmc), 10)
    70. top10
    71. # 展示高变基因
    72. plot1 <- VariableFeaturePlot(pbmc)
    73. plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
    74. plot1
    75. plot2
    76. ## =============6.Scaling the data
    77. # 归一化处理:每一个基因在所有细胞中的均值变为0,方差标为1,对于降维来说是必需步骤
    78. # 归一化后的值保存在:pbmc[["RNA"]]@scale.data
    79. pbmc <- ScaleData(pbmc)
    80. scale.data <- pbmc[["RNA"]]@scale.data
    81. dim(scale.data)
    82. scale.data[1:10,1:4]
    83. # 后面绘制热图使用的是归一化后的值,可以选择全部基因归一化,但是耗时久一点
    84. all.genes <- rownames(pbmc)
    85. pbmc <- ScaleData(pbmc, features = all.genes)
    86. ## =============7.降维
    87. # PCA降维,默认使用前面2000个高变基因,可以使用features改变用于降维的基因集
    88. pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
    89. # 可视化
    90. VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")
    91. DimPlot(pbmc, reduction = "pca")
    92. DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)
    93. DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)
    94. ## =============8.确定使用PC个数
    95. # # each PC essentially representing a ‘metafeature’
    96. # 方式1:根据p值判断,比较耗时
    97. # pbmc <- JackStraw(pbmc, num.replicate = 100)
    98. # pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
    99. #
    100. # JackStrawPlot(pbmc, dims = 1:15)
    101. # 方式2:肘部法,根据拐点判断,比较主观,尽量多选几个PC
    102. ElbowPlot(pbmc, ndims = 30)
    103. ## =============9.对细胞聚类
    104. # 选取10个PC,首先基于PCA空间构建一个基于欧氏距离的KNN图
    105. pbmc <- FindNeighbors(pbmc, dims = 1:10)
    106. # 聚类并最优化
    107. # resolution参数:值越大,细胞分群数越多,
    108. # 0.4-1.2 typically returns good results for single-cell datasets of around 3K cells
    109. # Optimal resolution often increases for larger datasets.
    110. pbmc <- FindClusters(pbmc, resolution = 0.5)
    111. # 查看聚类数ID
    112. head(Idents(pbmc), 5)
    113. # 查看每个类别多少个细胞
    114. head(pbmc@meta.data)
    115. table(pbmc@meta.data$seurat_clusters)
    116. # resolution对聚类的影响
    117. res.used <- seq(0.1,1,by=0.2)
    118. res.used
    119. for(i in res.used){
    120. pbmc <- FindClusters(pbmc, verbose = T, resolution = res.used)
    121. }
    122. # 可视化
    123. library(clustree)
    124. clus.tree.out <- clustree(pbmc) +
    125. theme(legend.position = "bottom") +
    126. scale_color_brewer(palette = "Set1") +
    127. scale_edge_color_continuous(low = "grey80", high = "red")
    128. clus.tree.out
    129. pbmc <- FindClusters(pbmc, verbose = T, resolution = 0.5)
    130. ## =============10.将细胞在低维空间可视化UMAP/tSNE
    131. set.seed(123)
    132. pbmc <- RunUMAP(pbmc, dims = 1:10)
    133. pbmc <- RunTSNE(pbmc, dims = 1:10)
    134. # 可视化
    135. DimPlot(pbmc, reduction = "umap", label = T, label.size = 5)
    136. DimPlot(pbmc, reduction = "tsne", label = T, label.size = 5)
    137. saveRDS(pbmc, file = "data/pbmc_tutorial.rds")
    138. #pbmc <- readRDS("data/pbmc_tutorial.rds")
    139. ## =============11.差异表达分析
    140. # 在cluster2 vs else中差异表达
    141. cluster1.markers <- FindMarkers(pbmc, ident.1 = 2, min.pct = 0.25)
    142. head(cluster1.markers, n = 5)
    143. # 指定两个类cluster 5 from clusters 0 and 3
    144. cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
    145. head(cluster5.markers, n = 5)
    146. # 所有类的差异表达基因
    147. # only.pos:只保留上调差异表达的基因
    148. pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
    149. write.csv(pbmc.markers,file = "data/pbmc.markers.csv")
    150. head(pbmc.markers)
    151. # 筛选
    152. pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_log2FC)
    153. # 选择一些基因进行可视化,作者这里根据自己的知识背景选择的相关基因
    154. VlnPlot(pbmc, features = c("MS4A1", "CD79A"))
    155. VlnPlot(pbmc, features = c("NKG7", "PF4"), slot = "counts", log = TRUE)
    156. FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"))
    157. DotPlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A")) + coord_flip() +
    158. theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5))
    159. top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
    160. DoHeatmap(pbmc, features = top10$gene) + NoLegend()
    161. ## =============12.细胞类型鉴定:使用经典的markers进行注释
    162. new.cluster.ids <- c("Naive CD4 T", # IL7R, CCR7
    163. "CD14+ Mono", # CD14, LYZ
    164. "Memory CD4 T", # IL7R, S100A4
    165. "B", # MS4A1
    166. "CD8 T", # CD8A
    167. "FCGR3A+ Mono", # FCGR3A, MS4A7
    168. "NK", # GNLY, NKG7
    169. "DC", # FCER1A, CST3
    170. "Platelet") # PPBP
    171. names(new.cluster.ids) <- levels(pbmc)
    172. new.cluster.ids
    173. # 原来的细胞聚类名称
    174. Idents(pbmc)
    175. # 更改细胞聚类的名字
    176. pbmc <- RenameIdents(pbmc, new.cluster.ids)
    177. Idents(pbmc)
    178. # 可视化
    179. DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 2,label.box = T) + NoLegend()
    180. DimPlot(pbmc, reduction = "tsne", label = TRUE, pt.size = 2,label.box = T) + NoLegend()
    181. # 保存结果
    182. pbmc@meta.data$cell_anno <- Idents(pbmc)
    183. head(pbmc@meta.data)
    184. write.csv(pbmc@meta.data,file = "data/metadata.csv")
    185. saveRDS(pbmc, file = "data/pbmc3k_final.rds")
    186. table(pbmc@meta.data$cell_anno)
    187. ## 去掉画图部分的流程总结
    188. data_dir <- "data/pbmc3k_filtered_gene_bc_matrices/"
    189. pbmc.counts <- Read10X(data.dir = data_dir)
    190. pbmc <- CreateSeuratObject(counts = pbmc.counts)
    191. pbmc <- NormalizeData(object = pbmc)
    192. pbmc <- FindVariableFeatures(object = pbmc)
    193. pbmc <- ScaleData(object = pbmc)
    194. pbmc <- RunPCA(object = pbmc)
    195. pbmc <- FindNeighbors(object = pbmc)
    196. pbmc <- FindClusters(object = pbmc)
    197. pbmc <- RunTSNE(object = pbmc)
    198. pbmc <- RunUMAP(object = pbmc)
    199. DimPlot(object = pbmc, reduction = "tsne")

    代码及图片均来自于生信技能树张娟老师