single cell 转录组数据过滤和整合是一个较为复杂的问题,也是最基本的问题。记录一下自己整合数据的过程。更多知识分享请到 https://zouhua.top/

加载R包

  1. library(dplyr)
  2. library(Seurat)
  3. library(tibble)
  4. library(patchwork)
  5. library(ggplot2)
  6. library(data.table)
  7. library(harmony)

读取数据

  1. rm(list = ls())
  2. options(stringsAsFactors = F)
  3. options(future.globals.maxSize = 10000 * 1024^2)
  4. hour4 <- fread("../Study/RawData/4hours.dge.txt") %>%
  5. column_to_rownames("V1")
  6. hour12 <- fread("../Study/RawData/12hours.dge.txt") %>%
  7. column_to_rownames("V1")
  8. day2 <- fread("../Study/RawData/2days.dge.txt") %>%
  9. column_to_rownames("V1")
  10. day14 <- fread("../Study/RawData/14days.dge.txt") %>%
  11. column_to_rownames("V1")
  12. week6 <- fread("../Study/RawData/6weeks.dge.txt") %>%
  13. column_to_rownames("V1")
  14. control <- fread("../Study/RawData/control.dge.txt") %>%
  15. column_to_rownames("V1")
  16. phen <- fread("../Study/Phenotype/metadata.txt") %>%
  17. column_to_rownames("V1")

创建seurat对象

  1. CreateObject <- function(count=count,
  2. metadata=phen,
  3. proj="GSE139107"){
  4. sid <- intersect(rownames(metadata), colnames(count))
  5. metadata.cln <- metadata[rownames(metadata)%in%sid, ]
  6. count.cln <- count %>% dplyr::select(sid)
  7. res <- CreateSeuratObject(counts = count.cln,
  8. project = proj,
  9. assay = "RNA",
  10. min.cells = 3,
  11. min.features = 150,
  12. meta.data = metadata.cln)
  13. res[["Batch"]] <- proj
  14. return(res)
  15. }
  16. hour4_seurat <- CreateObject(count = hour4)
  17. hour12_seurat <- CreateObject(count = hour12)
  18. day2_seurat <- CreateObject(count = day2)
  19. day14_seurat <- CreateObject(count = day14)
  20. week6_seurat <- CreateObject(count = week6)
  21. control_seurat <- CreateObject(count = control)

合并对象

  1. seurat_object <- merge(x = control_seurat,
  2. y = c(hour4_seurat, hour12_seurat, day2_seurat, day14_seurat, week6_seurat))
  3. rm(control_seurat, hour4_seurat, hour12_seurat, day2_seurat, day14_seurat, week6_seurat)
  4. # save data
  5. dir_RDS <- "../Result/RDS"
  6. if(!dir.exists(dir_RDS)){
  7. dir.create(dir_RDS)
  8. }
  9. seurat_object_name <- paste0(dir_RDS, "/seurat_merge_all.RDS")
  10. saveRDS(seurat_object, file = seurat_object_name)

每组的细胞数量

  1. ggplot(data.frame(table(seurat_object@meta.data$orig.ident)) %>% setNames(c("samples", "Number")) ,aes(x=samples, y=Number))+
  2. geom_bar(stat = "identity") +
  3. ylab("The Number of cell per sample")+
  4. geom_hline(yintercept = 2000, linetype = 2) +
  5. theme_classic()+
  6. theme(axis.text.x = element_text(angle = 45, hjust = .5, vjust = .5))
  7. dim(seurat_object@meta.data)

过滤细胞

  • nFeature_RNA
  • nCount_RNA
  • mitochondrion gene
  1. # seurat_object <- readRDS("../Result/RDS/seurat_merge_all.RDS")
  2. # seurat_object@assays$RNA@counts@Dimnames[[1]] feature name
  3. seurat_object[["mitoRatio"]] <- PercentageFeatureSet(seurat_object, pattern = "^mt-")
  4. # before
  5. VlnPlot(seurat_object, features = c("nFeature_RNA", "nCount_RNA", "mitoRatio"),
  6. ncol = 2, pt.size = 0.2, group.by = "Group")
  7. # Get QC Thresholds
  8. ncount_q <- quantile(seurat_object@meta.data$nCount_RNA, c(0.025, 0.975))
  9. nfeature_q <- quantile(seurat_object@meta.data$nFeature_RNA, c(0.025, 0.975))
  10. mt_q <- quantile(Sobject@meta.data$percent.mt, c(0.025, 0.975))
  11. # QC Plots
  12. plot(seurat_object@meta.data$nCount_RNA, seurat_object@meta.data$nFeature_RNA,
  13. pch=16,
  14. cex=0.7,
  15. bty="n")
  16. abline(h=c(as.numeric(nfeature_q)[1], as.numeric(nfeature_q)[2]),
  17. v=c(as.numeric(ncount_q)[1], as.numeric(ncount_q)[2]),
  18. lty=2, lwd=2, col="red")
  19. # filter
  20. seurat_object <- subset(seurat_object, subset = nFeature_RNA > as.numeric(nfeature_q)[1] &
  21. nFeature_RNA < as.numeric(nfeature_q)[2] &
  22. nCount_RNA > as.numeric(ncount_q)[1] &
  23. nCount_RNA < as.numeric(ncount_q)[2] &
  24. percent.mt < as.numeric(mt_q)[2] )
  25. # after
  26. VlnPlot(seurat_object, features = c("nFeature_RNA", "nCount_RNA", "mitoRatio"), ncol = 2, pt.size = 0.2, group.by = "Group")
  27. # save data
  28. saveRDS(seurat_object, file = "../Result/RDS/seurat_merge_all_filter.RDS")

查看过滤后数据分布(整合前)

  1. # seurat_object <- readRDS("../Result/RDS/seurat_merge_all_filter.RDS")
  2. seurat_object <- FindVariableFeatures(object = seurat_object,
  3. selection.method = "vst",
  4. nfeatures = 2000,
  5. verbose = TRUE)
  6. seurat_object <- ScaleData(object = seurat_object, features = rownames(seurat_object))
  7. seurat_object <- RunPCA(seurat_object, npcs = 30, verbose = TRUE)
  8. seurat_object <- RunTSNE(seurat_object, reduction = "pca", verbose = TRUE, check_duplicates = FALSE)
  9. seurat_object <- RunUMAP(seurat_object, reduction = "pca", dims = 1:30)
  10. DimPlot(seurat_object, reduction = "umap", group.by = "Group")

整合数据(结合harmony包的RunHarmony函数)

  1. split_seurat <- SplitObject(seurat_object, split.by = "Group")
  2. for (i in 1:length(split_seurat)) {
  3. split_seurat[[i]] <- SCTransform(split_seurat[[i]],
  4. vars.to.regress = c("mitoRatio"),
  5. verbose = TRUE)
  6. }
  7. seurat_features <- SelectIntegrationFeatures(object.list = split_seurat, nfeatures = 2000)
  8. seurat_merged <- merge(split_seurat[[1]], y = split_seurat[2:length(split_seurat)],
  9. project = "GSE139107", merge.data = TRUE)
  10. VariableFeatures(seurat_merged) <- seurat_features
  11. seurat_merged <- RunPCA(object = seurat_merged, assay = "SCT", npcs = 50)
  12. seurat_merged <- RunHarmony(object = seurat_merged,
  13. assay.use = "SCT",
  14. reduction = "pca",
  15. dims.use = 1:50,
  16. group.by.vars = "Group",
  17. plot_convergence = TRUE)
  18. # save integrated data
  19. seurat_merged <- RunTSNE(seurat_merged, reduction = "harmony", verbose = TRUE, check_duplicates = FALSE)
  20. seurat_merged <- RunUMAP(seurat_merged, reduction = "harmony", dims = 1:30)
  21. saveRDS(seurat_merged, file = "../Result/RDS/seurat_integrated.RDS")

整合数据(备选方案,基于Anchors策略)

  1. # Select the most variable features to use for integration
  2. data.features <- SelectIntegrationFeatures(object.list = split_seurat,
  3. nfeatures = 2000)
  4. # Prepare the SCT list object for integration
  5. data.seurat <- PrepSCTIntegration(object.list = split_seurat,
  6. anchor.features = data.features,
  7. verbose = FALSE)
  8. # Find best buddies - can take a while to run
  9. data.anchors <- FindIntegrationAnchors(object.list = data.seurat,
  10. normalization.method = "SCT",
  11. anchor.features = data.features,
  12. verbose = FALSE)
  13. # Integrate across conditions
  14. data.integrated <- IntegrateData(anchorset = data.anchors,
  15. normalization.method = "SCT",
  16. verbose = FALSE)
  17. data.integrated <- RunPCA(data.integrated, verbose = FALSE)
  18. data.integrated <- RunUMAP(data.integrated, dims = 1:30)
  19. data.integrated <- RunTSNE(data.integrated, reduction = "pca", dims = 1:30,
  20. check_duplicates = FALSE)
  21. # save integrated data
  22. seurat_merged <- RunTSNE(seurat_merged, reduction = "pca", verbose = TRUE, check_duplicates = FALSE)
  23. seurat_merged <- RunUMAP(seurat_merged, reduction = "pca", dims = 1:30)
  24. saveRDS(data.integrated, "../../Result/RDS/seurat_integrated.sct.rds", compress = TRUE)

查看结果

  1. DimPlot(seurat_merged, reduction = "umap", group.by = "Group", label = TRUE, label.size = 6)
  2. DimPlot(data.integrated, reduction = "tsne", group.by = "Group", label = TRUE, label.size = 6)

Reference

  1. Harmony with Seurat SCTransform

参考文章如引起任何侵权问题,可以与我联系,谢谢。