Seurat-多样本整合

R版本与运行环境信息

  1. Date:2021-5-6
  2. sessionInfo("Seurat")
  3. R version 4.0.3 (2020-10-10)
  4. Platform: x86_64-w64-mingw32/x64 (64-bit)
  5. Running under: Windows 10 x64 (build 18363)
  6. version: Seurat_4.0.1

资料来源

https://mp.weixin.qq.com/s/dz0dJNf2slhRNd-9bG188g

工作目录设置与查看数据

目的:将会使用3种方法对数据进行合并, 并比较区别

  • 直接将数据读入构建10个数据的10x对象
  • 使用R基础函数merge()进行直接合并
  • 使用Seurat提供的CCA+MNN进行数据整合
  1. > setwd("D:\\Desktop\\s_note\\data\\singel_cell\\IntegrateData")
  2. #共有10组数据,来自Immune Landscape of Viral- and Carcinogen-Driven Head and Neck Cancer,数据集GEO编号:GSE139324
  3. > dir()
  4. [1] "GSM4138110" "GSM4138111" "GSM4138128" "GSM4138129" "GSM4138148" "GSM4138149" "GSM4138162" "GSM4138163"
  5. [9] "GSM4138168" "GSM4138169"
  6. #每组数据均为10x下机数据标准格式
  7. > dir(".\\GSM4138110")
  8. [1] "barcodes.tsv.gz" "features.tsv.gz" "matrix.mtx.gz"

载入包并读入数据

  1. library(Seurat)
  2. library(tidyverse)
  3. library(patchwork)
  4. library(celldex)
  5. library(SingleR)
  6. #由于配置不太行,所以就只读入三组吧......
  7. #构建文件列表
  8. data.list <- dir() %>% str_c(".\\",.)
  9. data.list <- data.list[2:4]
  10. names(data.list) = c('HNC01TI', 'HNC10PBMC', 'HNC10TIL')
  11. #names(data.list) = c('HNC01PBMC', 'HNC01TIL', 'HNC10PBMC', 'HNC10TIL', 'HNC20PBMC',
  12. # 'HNC20TIL', 'PBMC1', 'PBMC2', 'Tonsil1', 'Tonsil2')

使用Read10X()读入数据

  1. counts <- Read10X(data.dir = data.list)

构建seurat对象

  1. #构建对象,并进行简单质控
  2. scRNA<- CreateSeuratObject(counts = counts,project = "TEST2",min.cells = 3,min.features = 200)
  3. > scRNA
  4. An object of class Seurat
  5. 17975 features across 4431 samples within 1 assay
  6. Active assay: RNA (17975 features, 0 variable features)

直接组合数据进行下游分析

数据质控

  1. #进行下游分析
  2. #质控,
  3. #计算每个cell中线粒体基因的含量
  4. scRNA[["mt.percent"]] <- PercentageFeatureSet(scRNA,pattern = "^MT-")
  5. #计算每个cell中红细胞基因的含量
  6. HB.genes <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ")
  7. #使用match函数匹配上述HB.gene是否在基因列表中,有则返回基因索引值,没有则返回NA,后利用str_subset()剔除NA将得到的index转换为数值,提取基因名
  8. #str_subset("str",".")表示输出除了NA外所有字符
  9. HB_m <- rownames(scRNA@assays$RNA)[str_subset(match(HB.genes, rownames(scRNA@assays$RNA)),".") %>% as.numeric()]
  10. #HB_m <- <- rownames(scRNA@assays$RNA)[HB.genes %in% rownames(scRNA@assays$RNA)]
  11. #每个cell中的红细胞数量
  12. scRNA[["HB.gene"]] <- PercentageFeatureSet(scRNA,features = HB_m)
  13. #对meta增加project列
  14. scRNA[["project"]] <- rep("Test1",nrow(scRNA@meta.data))
  15. #进行质控,设定质控参数
  16. minGene=500
  17. maxGene=3000
  18. pctMT=10
  19. pcHB=0
  20. scRNA <- subset(scRNA,subset =
  21. nFeature_RNA > minGene &
  22. nFeature_RNA < maxGene &
  23. percent.mt <= pctMT &
  24. percent.HB <= 0)

数据归一化以及高变基因筛选

  1. #数据归一化与标准化
  2. scRNA <- NormalizeData(scRNA,normalization.method = "LogNormalize",scale.factor = 10000)
  3. #高变基因筛选
  4. scRNA <- FindVariableFeatures(scRNA,selection.method = "vst",nfeatures = 2000)
  5. #查看高变基因
  6. plot1 <- VariableFeaturePlot(scRNA)
  7. top10 <- head(VariableFeatures(scRNA),10)
  8. LabelPoints(plot = plot1,points = top10,repel = T)

scRNA-多样本合并整合比较 - 图1

PCA分析

  1. #数据标准化
  2. scRNA <- ScaleData(scRNA,features = VariableFeatures(immune.combine))
  3. #PCA,细胞并没有很好地融合在一起
  4. scRNA <- RunPCA(scRNA)
  5. DimPlot(scRNA,reduction = "pca",group.by = "orig.ident")
  6. #确定后续分析选用的主成分数量
  7. scRNA <- JackStraw(scRNA,num.replicate = 100)
  8. scRNA <- ScoreJackStraw(scRNA,dims = 1:20)
  9. JackStrawPlot(scRNA,dims = 1:20)

scRNA-多样本合并整合比较 - 图2

聚类分析

  1. scRNA <- FindNeighbors(scRNA,dims = 1:30)
  2. scRNA <- FindClusters(scRNA,resolution = 0.5)
  3. table(Idents(scRNA))
  4. #查看聚类情况
  5. > table(Idents(scRNA))
  6. 0 1 2 3 4 5 6 7 8 9 10 11 12 13
  7. 506 485 483 428 416 376 273 268 246 170 167 107 76 40

细胞类型注释

  1. #SingleR的安装
  2. BiocManager::install("SingleR")
  3. #安装数据库
  4. BiocManager::install("celldex")
  5. #下载数据库
  6. refdata <- MonacoImmuneData()
  7. ref <- HumanPrimaryCellAtlasData()
  8. ref <- BlueprintEncodeData()
  9. ref <- MouseRNAseqData()
  10. ref <- ImmGenData()
  11. ref <- DatabaseImmuneCellExpressionData()
  12. ref <- NovershternHematopoieticData()
  1. #载入需要注释的数据
  2. testdata <- GetAssayData(scRNA, slot="data")
  3. #载入数据库
  4. refdata <- MonacoImmuneData()
  5. #提取cluster
  6. clusters <- scRNA@meta.data$seurat_clusters
  7. #细胞类型鉴定
  8. celltype <- SingleR(test = testdata,
  9. ref= refdata,
  10. label =refdata$label.main,
  11. clusters = clusters1,
  12. assay.type.test = "logcounts",
  13. assay.type.ref = "logcounts")
  14. #提取注释信息
  15. cell.name <- data.frame(clusters=rownames(celltype1),cellname=celltype1$labels)
  16. #对cell类型进行注释
  17. #细胞注释
  18. new.cluster.ids <- cell.name
  19. names(new.cluster.ids) <- levels(testdata)
  20. testdata <- RenameIdents(testdata,new.cluster.ids)
  21. #写入metadata
  22. testdata[["cell.type"]] <- Idents(testdata)
  23. #可视化
  24. p1 <- DimPlot(scRNA,reduction = "umap",group.by = "cell.type",label = T)
  25. p2 <- DimPlot(scRNA,reduction = "umap",group.by = "orig.ident",label = T)
  26. p1+p2
  27. #可以看出相同或的cell并没有很好的融合在一起

scRNA-多样本合并整合比较 - 图3

  1. #查看t-sne结果,同样批次效应明显
  2. scRNA <- RunTSNE(scRNA,dims = 1:30)
  3. p3 <- DimPlot(scRNA,reduction = "tsne",group.by = "cell.type",label = T)
  4. p4 <- DimPlot(scRNA,reduction = "tsne",group.by = "orig.ident",label = T)
  5. p3 + p4

scRNA-多样本合并整合比较 - 图4

使用seurat提供的数据整合算法

分割数据并整合

使用Read10x()读入数据后构建seurat对象后使用SplitObject()根据指定的因子分割对象

  1. counts <- Read10X(data.dir = data.list)
  2. scRNA2 <- CreateSeuratObject(counts = counts,project = "TEST2",min.cells = 3,min.features = 200)
  3. scRNA2 <- SplitObject(scRNA2,split.by = "orig.ident")
  4. #返回包含3个seurat对象的列表
  5. > scRNA2
  6. $HNC01TI
  7. An object of class Seurat
  8. 17975 features across 1298 samples within 1 assay
  9. Active assay: RNA (17975 features, 0 variable features)
  10. $HNC10PBMC
  11. An object of class Seurat
  12. 17975 features across 1750 samples within 1 assay
  13. Active assay: RNA (17975 features, 0 variable features)
  14. $HNC10TIL
  15. An object of class Seurat
  16. 17975 features across 1383 samples within 1 assay
  17. Active assay: RNA (17975 features, 0 variable features)
  18. #整合3组数据
  19. #分别对每一组数据标准化,并筛选高变基因
  20. scRNA2 <- lapply(scRNA2, function(obj){
  21. #归一化并分别选取高变基因
  22. obj <- NormalizeData(obj)
  23. obj <- FindVariableFeatures(obj,selection.method = "vst",nfeatures = 2000)
  24. })
  25. #整合数据(详见多样本整合)
  26. features <- SelectIntegrationFeatures(object.list = scRNA2)
  27. immune.anchors <- FindIntegrationAnchors(object.list = scRNA2,anchor.features = features)
  28. immune.combine <- IntegrateData(anchorset = immune.anchors)
  29. #将下游分析改为整合后的原始数据,integrated为整合后的保留2000高变基因的数据
  30. DefaultAssay(immune.combine) <- "RNA"
  31. #查看数据情况
  32. > immune.combine
  33. An object of class Seurat
  34. 19975 features across 4431 samples within 2 assays
  35. Active assay: RNA (17975 features, 0 variable features)
  36. 1 other assay present: integrated

数据质控

  1. counts <- Read10X(data.dir = data.list)
  2. scRNA2 <- CreateSeuratObject(counts = counts,project = "TEST2",min.cells = 3,min.features = 200)
  3. scRNA2[["percent.mt"]] <- PercentageFeatureSet(scRNA2,pattern = "^MT-")
  4. HB.genes <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ")
  5. HB_m <- rownames(scRNA2@assays$RNA)[str_subset(match(HB.genes, rownames(scRNA2@assays$RNA)),".") %>% as.numeric()]
  6. scRNA2[["percent.HB"]] <- PercentageFeatureSet(scRNA2,features = HB_m)
  7. minGene=500
  8. maxGene=3000
  9. pctMT=10
  10. pcHB=0
  11. immune.combine <- subset(scRNA2,subset =
  12. nFeature_RNA > minGene &
  13. nFeature_RNA < maxGene &
  14. percent.mt <= pctMT &
  15. percent.HB <= 0)

归一化与高变基因筛选

  1. scRNA2 <- SplitObject(scRNA2,split.by = "orig.ident")
  2. scRNA2 <- lapply(scRNA2, function(obj){
  3. #归一化并分别选取高变基因
  4. obj <- NormalizeData(obj)
  5. obj <- FindVariableFeatures(obj,selection.method = "vst",nfeatures = 2000)
  6. })

数据整合

  1. features <- SelectIntegrationFeatures(object.list = scRNA2)
  2. immune.anchors <- FindIntegrationAnchors(object.list = scRNA2,anchor.features = features)
  3. immune.combine <- IntegrateData(anchorset = immune.anchors)

数据分析

  1. immune.combine <- ScaleData(immune.combine, verbose = T)
  2. immune.combine <- RunPCA(immune.combine, npcs = 30, verbose = T)
  3. immune.combine <- RunUMAP(immune.combine, reduction = "pca", dims = 1:30)
  4. immune.combine <- FindNeighbors(immune.combine, reduction = "pca", dims = 1:30)
  5. immune.combine <- FindClusters(immune.combine, resolution = 0.5)
  6. immune.combine <- RunTSNE(immune.combine,dims = 1:30)
  7. #注释
  8. testdata <- GetAssayData(immune.combine, slot="data")
  9. #载入数据库
  10. refdata <- MonacoImmuneData()
  11. #提取cluster
  12. clusters <- immune.combine@meta.data$seurat_clusters
  13. #细胞类型鉴定
  14. celltype <- SingleR(test = testdata,
  15. ref= refdata,
  16. label =refdata$label.main,
  17. clusters = clusters,
  18. assay.type.test = "logcounts",
  19. assay.type.ref = "logcounts")
  20. #提取注释信息
  21. cell.name <- data.frame(clusters=rownames(celltype),cellname=celltype$labels)
  22. #细胞注释
  23. new.cluster.ids <- cell.name[,2]
  24. names(new.cluster.ids) <- levels(pbmc)
  25. pbmc <- RenameIdents(pbmc,new.cluster.ids)
  26. #注释信息写入meta.data
  27. pbmc$cell.type <- Idents(pbmc)
  28. #将信息写入meta.data
  29. immune.combine[["cell.type"]] <- clusters
  30. #可视化
  31. p1 <- DimPlot(immune.combine, reduction = "umap", group.by = "orig.ident",label = T)
  32. p2 <- DimPlot(immune.combine, reduction = "umap", label = TRUE, repel = TRUE,group.by = "cell.type")
  33. p1 + p2

scRNA-多样本合并整合比较 - 图5

  1. p3 <- DimPlot(immune.combine, reduction = "tsne", group.by = "orig.ident",label = T)
  2. p4 <- DimPlot(immune.combine, reduction = "tsne", label = TRUE, repel = TRUE,group.by = "cell.type")
  3. p3 + p4

scRNA-多样本合并整合比较 - 图6