在本教程中,我们将学习使用Signac包对多样本的scATAC-seq数据进行整合分析。这里,我们对来自10x Genomics和sci-ATAC-seq技术测序的成年小鼠大脑的多个单细胞ATAC-seq数据集进行了整合分析。

其中,10x Genomics平台产生的原始数据可从官网下载:

sci-ATAC-seq技术产生的数据集由Cusanovich和Hill等人生成。原始数据可从作者的网站下载:

我们将演示使用Seurat v3中的数据整合方法(dataset integration and label transfer)对多个scATAC-seq数据集进行整合分析,以及使用harmony包进行数据整合。

安装并加载所需的R包

  1. if (!requireNamespace("BiocManager", quietly = TRUE))
  2. install.packages("BiocManager")
  3. BiocManager::install("rtracklayer")
  4. library(devtools)
  5. install_github("immunogenomics/harmony")
  6. library(Signac)
  7. library(Seurat)
  8. library(patchwork)
  9. set.seed(1234)

加载数据集并构建Seurat对象

  1. # this object was created following the mouse brain vignette
  2. # 加载10x Genomics的小鼠大脑的scATAC-seq数据
  3. tenx <- readRDS(file = "/home/dongwei/scATAC-seq/brain/10x/adult_mouse_brain.rds")
  4. tenx$tech <- '10x'
  5. tenx$celltype <- Idents(tenx)
  6. # 加载sci-ATAC-seq的小鼠大脑数据
  7. sci.metadata <- read.table(
  8. file = "/home/dongwei/scATAC-seq/brain/sci/cell_metadata.txt",
  9. header = TRUE,
  10. row.names = 1,
  11. sep = "\t"
  12. )
  13. # subset to include only the brain data
  14. sci.metadata <- sci.metadata[sci.metadata$tissue == 'PreFrontalCortex', ]
  15. sci.counts <- readRDS(file = "/home/dongwei/scATAC-seq/brain/sci/atac_matrix.binary.qc_filtered.rds")
  16. sci.counts <- sci.counts[, rownames(x = sci.metadata)]

数据预处理

在上述两个数据集中,sci-ATAC-seq数据是比对到小鼠mm9参考基因组的,而10x的数据是比对到小鼠mm10参考基因组的,因此这两个数据集中peaks的基因组坐标信息是不同的。我们可以使用rtracklayer包将mm9参考基因组的坐标信息转换到mm10中,并使用mm10的坐标更换sci-ATAC-seq数据中peaks的坐标,其中liftover转换的chain文件可以从UCSC官网进行下载。

  1. # 将peaks坐标信息转换成GRanges格式
  2. sci_peaks_mm9 <- StringToGRanges(regions = rownames(sci.counts), sep = c("_", "_"))
  3. # 导入mm9ToMm10.over.chain文件
  4. mm9_mm10 <- rtracklayer::import.chain("/home/dongwei/data/liftover/mm9ToMm10.over.chain")
  5. # 使用rtracklayer包中的liftOver函数转换坐标信息
  6. sci_peaks_mm10 <- rtracklayer::liftOver(x = sci_peaks_mm9, chain = mm9_mm10)
  7. names(sci_peaks_mm10) <- rownames(sci.counts)
  8. # discard any peaks that were mapped to >1 region in mm10
  9. correspondence <- S4Vectors::elementNROWS(sci_peaks_mm10)
  10. sci_peaks_mm10 <- sci_peaks_mm10[correspondence == 1]
  11. sci_peaks_mm10 <- unlist(sci_peaks_mm10)
  12. sci.counts <- sci.counts[names(sci_peaks_mm10), ]
  13. # rename peaks with mm10 coordinates
  14. rownames(sci.counts) <- GRangesToString(grange = sci_peaks_mm10)
  15. # create Seurat object and perform some basic QC filtering
  16. # 构建Seurat对象
  17. sci <- CreateSeuratObject(
  18. counts = sci.counts,
  19. meta.data = sci.metadata,
  20. assay = 'peaks',
  21. project = 'sci'
  22. )
  23. # 数据过滤
  24. sci.ds <- sci[, sci$nFeature_peaks > 2000 & sci$nCount_peaks > 5000 & !(sci$cell_label %in% c("Collisions", "Unknown"))]
  25. sci$tech <- 'sciATAC'
  26. # 使用RunTFIDF函数进行数据归一化
  27. sci <- RunTFIDF(sci)
  28. # 使用FindTopFeatures函数提取高变异的peaks
  29. sci <- FindTopFeatures(sci, min.cutoff = 50)
  30. # 使用RunSVD函数进行线性降维
  31. sci <- RunSVD(sci, n = 30, reduction.name = 'lsi', reduction.key = 'LSI_')
  32. # 使用RunUMAP函数进行非线性降维
  33. sci <- RunUMAP(sci, reduction = 'lsi', dims = 2:30)

现在,我们构建好了两个scATAC-seq对象,并且它们都含有基于相同的mm10参考基因组坐标系统得到的peaks信息。但是,由于这两个实验都单独进行了peak calling,因此这两个数据集中得到的peaks坐标不太可能完全重叠。为了在我们要整合的数据集中具有共同的特征,我们可以基于10x Genomics数据集对sci-ATAC-seq中peaks的reads进行计数,并使用这些计数创建一个新的assay。

  1. # find peaks that intersect in both datasets
  2. # 使用GetIntersectingFeatures函数提取两个数据集中重叠的peak区域
  3. intersecting.regions <- GetIntersectingFeatures(
  4. object.1 = sci,
  5. object.2 = tenx,
  6. sep.1 = c("-", "-"),
  7. sep.2 = c(":", "-")
  8. )
  9. # choose a subset of intersecting peaks
  10. peaks.use <- sample(intersecting.regions[[1]], size = 10000, replace = FALSE)
  11. # count fragments per cell overlapping the set of peaks in the 10x data
  12. # 使用FeatureMatrix函数对peaks中的reads进行计数
  13. sci_peaks_tenx <- FeatureMatrix(
  14. fragments = GetFragments(object = tenx, assay = 'peaks'),
  15. features = StringToGRanges(peaks.use),
  16. cells = colnames(tenx)
  17. )
  18. # create a new assay and add it to the 10x dataset
  19. # 使用CreateAssayObject函数新建一个assay对象
  20. tenx[['sciPeaks']] <- CreateAssayObject(counts = sci_peaks_tenx, min.cells = 1)
  21. # 数据归一化
  22. tenx <- RunTFIDF(object = tenx, assay = 'sciPeaks')

多样本scATAC-seq数据集的整合

在进行数据整合之前,我们最好先检查下是否存在数据集特异的差异,并将其删除。如果没有,我们可以简单地将多个对象进行合并而不执行整合。在本示例中,由于使用不同的测序技术,两个数据集之间存在很大的差异。我们可以使用Seurat v3中的数据整合方法来消除这种影响。

  1. # 先简单的将两个数据集进行合并看一下聚类的效果
  2. # Look at the data without integration first
  3. # 使用MergeWithRegions函数将两个数据对象进行合并
  4. unintegrated <- MergeWithRegions(
  5. object.1 = sci,
  6. object.2 = tenx,
  7. assay.1 = 'peaks',
  8. assay.2 = 'sciPeaks',
  9. sep.1 = c("-", "-"),
  10. sep.2 = c("-", "-")
  11. )
  12. # 对合并后数据进行归一化,特征选择和降维可视化
  13. unintegrated <- RunTFIDF(unintegrated)
  14. unintegrated <- FindTopFeatures(unintegrated, min.cutoff = 50)
  15. unintegrated <- RunSVD(unintegrated, n = 30, reduction.name = 'lsi', reduction.key = 'LSI_')
  16. unintegrated <- RunUMAP(unintegrated, reduction = 'lsi', dims = 2:30)
  17. p1 <- DimPlot(unintegrated, group.by = 'tech', pt.size = 0.1) + ggplot2::ggtitle("Unintegrated")
  18. # 使用Seurat v3的数据整合方法进行数据集的整合
  19. # find integration anchors between 10x and sci-ATAC
  20. # 使用FindIntegrationAnchors函数识别共享的整合anchors
  21. anchors <- FindIntegrationAnchors(
  22. object.list = list(tenx, sci),
  23. anchor.features = peaks.use,
  24. assay = c('sciPeaks', 'peaks'),
  25. k.filter = NA
  26. )
  27. # integrate data and create a new merged object
  28. # 使用IntegrateData函数根据识别的anchors进行数据整合
  29. integrated <- IntegrateData(
  30. anchorset = anchors,
  31. weight.reduction = sci[['lsi']],
  32. dims = 2:30,
  33. preserve.order = TRUE
  34. )
  35. # we now have a "corrected" TF-IDF matrix, and can run LSI again on this corrected matrix
  36. # 对整合后的数据进行降维可视化
  37. integrated <- RunSVD(
  38. object = integrated,
  39. n = 30,
  40. reduction.name = 'integratedLSI'
  41. )
  42. integrated <- RunUMAP(
  43. object = integrated,
  44. dims = 2:30,
  45. reduction = 'integratedLSI'
  46. )
  47. p2 <- DimPlot(integrated, group.by = 'tech', pt.size = 0.1) + ggplot2::ggtitle("Integrated")
  48. p1 + p2

使用Signac包进行单细胞ATAC-seq数据分析(三):scATAC-seq data integration - 图1

Label transfer

我们还可以使用Seurat v3中的Label transfer方法进行数据集的整合,它将数据从一个query数据集映射到另一个reference数据集中。在这里,我们通过将细胞类型标签从10x Genomics scATAC-seq数据映射到到sci-ATAC-seq数据中。

  1. # 使用FindTransferAnchors函数识别整合的anchors
  2. transfer.anchors <- FindTransferAnchors(
  3. reference = tenx,
  4. query = sci,
  5. reference.assay = 'sciPeaks',
  6. query.assay = 'peaks',
  7. reduction = 'cca',
  8. features = peaks.use,
  9. k.filter = NA
  10. )
  11. # 使用TransferData函数基于识别好的anchors进行数据映射整合
  12. predicted.id <- TransferData(
  13. anchorset = transfer.anchors,
  14. refdata = tenx$celltype,
  15. weight.reduction = sci[['lsi']],
  16. dims = 2:30
  17. )
  18. sci <- AddMetaData(
  19. object = sci,
  20. metadata = predicted.id
  21. )
  22. sci$predicted.id <- factor(sci$predicted.id, levels = levels(tenx$celltype)) # to make the colors match
  23. # 数据可视化
  24. p3 <- DimPlot(tenx, group.by = 'celltype', label = TRUE) + NoLegend() + ggplot2::ggtitle("Celltype labels 10x scATAC-seq")
  25. p4 <- DimPlot(sci, group.by = 'predicted.id', label = TRUE) + NoLegend() + ggplot2::ggtitle("Predicted labels sci-ATAC-seq")
  26. p3 + p4

使用Signac包进行单细胞ATAC-seq数据分析(三):scATAC-seq data integration - 图2

Integration with Harmony 使用Harmony包进行数据整合

Harmony需要一个对象作为输入,因此这里我们使用MergeWithRegions函数以coordinate-aware的方式将sci-ATAC-seq数据集和10x的scATAC-seq数据集进行合并,然后对合并后的对象进行LSI线性降维。数据降维后,我们可以使用RunHarmony函数调用Harmony的方法进行数据的整合,并提供用作分组变量的技术来消除sci-ATAC-seq和10x Genomics scATAC-seq数据集之间的批次差异。这会产生一组“校正”的LSI嵌入,可以进一步用于UMAP或tSNE降维,并进行细胞聚类分群。

  1. # 加载harmony包
  2. library(harmony)
  3. # 使用RunHarmony函数进行数据整合
  4. hm.integrated <- RunHarmony(
  5. object = unintegrated,
  6. group.by.vars = 'tech',
  7. reduction = 'lsi',
  8. assay.use = 'peaks',
  9. project.dim = FALSE
  10. )
  11. # re-compute the UMAP using corrected LSI embeddings
  12. # 数据降维可视化
  13. hm.integrated <- RunUMAP(hm.integrated, dims = 2:30, reduction = 'harmony')
  14. p5 <- DimPlot(hm.integrated, group.by = 'tech', pt.size = 0.1) + ggplot2::ggtitle("Harmony integration")
  15. p1 + p5

使用Signac包进行单细胞ATAC-seq数据分析(三):scATAC-seq data integration - 图3

参考来源:https://satijalab.org/signac/articles/integration.html