single cell 转录组数据过滤和整合是一个较为复杂的问题,也是最基本的问题。记录一下自己整合数据的过程。更多知识分享请到 https://zouhua.top/。
加载R包
library(dplyr)library(Seurat)library(tibble)library(patchwork)library(ggplot2)library(data.table)library(harmony)
读取数据
rm(list = ls())options(stringsAsFactors = F)options(future.globals.maxSize = 10000 * 1024^2)hour4 <- fread("../Study/RawData/4hours.dge.txt") %>%column_to_rownames("V1")hour12 <- fread("../Study/RawData/12hours.dge.txt") %>%column_to_rownames("V1")day2 <- fread("../Study/RawData/2days.dge.txt") %>%column_to_rownames("V1")day14 <- fread("../Study/RawData/14days.dge.txt") %>%column_to_rownames("V1")week6 <- fread("../Study/RawData/6weeks.dge.txt") %>%column_to_rownames("V1")control <- fread("../Study/RawData/control.dge.txt") %>%column_to_rownames("V1")phen <- fread("../Study/Phenotype/metadata.txt") %>%column_to_rownames("V1")
创建seurat对象
CreateObject <- function(count=count,metadata=phen,proj="GSE139107"){sid <- intersect(rownames(metadata), colnames(count))metadata.cln <- metadata[rownames(metadata)%in%sid, ]count.cln <- count %>% dplyr::select(sid)res <- CreateSeuratObject(counts = count.cln,project = proj,assay = "RNA",min.cells = 3,min.features = 150,meta.data = metadata.cln)res[["Batch"]] <- projreturn(res)}hour4_seurat <- CreateObject(count = hour4)hour12_seurat <- CreateObject(count = hour12)day2_seurat <- CreateObject(count = day2)day14_seurat <- CreateObject(count = day14)week6_seurat <- CreateObject(count = week6)control_seurat <- CreateObject(count = control)
合并对象
seurat_object <- merge(x = control_seurat,y = c(hour4_seurat, hour12_seurat, day2_seurat, day14_seurat, week6_seurat))rm(control_seurat, hour4_seurat, hour12_seurat, day2_seurat, day14_seurat, week6_seurat)# save datadir_RDS <- "../Result/RDS"if(!dir.exists(dir_RDS)){dir.create(dir_RDS)}seurat_object_name <- paste0(dir_RDS, "/seurat_merge_all.RDS")saveRDS(seurat_object, file = seurat_object_name)
每组的细胞数量
ggplot(data.frame(table(seurat_object@meta.data$orig.ident)) %>% setNames(c("samples", "Number")) ,aes(x=samples, y=Number))+geom_bar(stat = "identity") +ylab("The Number of cell per sample")+geom_hline(yintercept = 2000, linetype = 2) +theme_classic()+theme(axis.text.x = element_text(angle = 45, hjust = .5, vjust = .5))dim(seurat_object@meta.data)
过滤细胞
- nFeature_RNA
- nCount_RNA
- mitochondrion gene
# seurat_object <- readRDS("../Result/RDS/seurat_merge_all.RDS")# seurat_object@assays$RNA@counts@Dimnames[[1]] feature nameseurat_object[["mitoRatio"]] <- PercentageFeatureSet(seurat_object, pattern = "^mt-")# beforeVlnPlot(seurat_object, features = c("nFeature_RNA", "nCount_RNA", "mitoRatio"),ncol = 2, pt.size = 0.2, group.by = "Group")# Get QC Thresholdsncount_q <- quantile(seurat_object@meta.data$nCount_RNA, c(0.025, 0.975))nfeature_q <- quantile(seurat_object@meta.data$nFeature_RNA, c(0.025, 0.975))mt_q <- quantile(Sobject@meta.data$percent.mt, c(0.025, 0.975))# QC Plotsplot(seurat_object@meta.data$nCount_RNA, seurat_object@meta.data$nFeature_RNA,pch=16,cex=0.7,bty="n")abline(h=c(as.numeric(nfeature_q)[1], as.numeric(nfeature_q)[2]),v=c(as.numeric(ncount_q)[1], as.numeric(ncount_q)[2]),lty=2, lwd=2, col="red")# filterseurat_object <- subset(seurat_object, subset = nFeature_RNA > as.numeric(nfeature_q)[1] &nFeature_RNA < as.numeric(nfeature_q)[2] &nCount_RNA > as.numeric(ncount_q)[1] &nCount_RNA < as.numeric(ncount_q)[2] &percent.mt < as.numeric(mt_q)[2] )# afterVlnPlot(seurat_object, features = c("nFeature_RNA", "nCount_RNA", "mitoRatio"), ncol = 2, pt.size = 0.2, group.by = "Group")# save datasaveRDS(seurat_object, file = "../Result/RDS/seurat_merge_all_filter.RDS")
查看过滤后数据分布(整合前)
# seurat_object <- readRDS("../Result/RDS/seurat_merge_all_filter.RDS")seurat_object <- FindVariableFeatures(object = seurat_object,selection.method = "vst",nfeatures = 2000,verbose = TRUE)seurat_object <- ScaleData(object = seurat_object, features = rownames(seurat_object))seurat_object <- RunPCA(seurat_object, npcs = 30, verbose = TRUE)seurat_object <- RunTSNE(seurat_object, reduction = "pca", verbose = TRUE, check_duplicates = FALSE)seurat_object <- RunUMAP(seurat_object, reduction = "pca", dims = 1:30)DimPlot(seurat_object, reduction = "umap", group.by = "Group")
整合数据(结合harmony包的RunHarmony函数)
split_seurat <- SplitObject(seurat_object, split.by = "Group")for (i in 1:length(split_seurat)) {split_seurat[[i]] <- SCTransform(split_seurat[[i]],vars.to.regress = c("mitoRatio"),verbose = TRUE)}seurat_features <- SelectIntegrationFeatures(object.list = split_seurat, nfeatures = 2000)seurat_merged <- merge(split_seurat[[1]], y = split_seurat[2:length(split_seurat)],project = "GSE139107", merge.data = TRUE)VariableFeatures(seurat_merged) <- seurat_featuresseurat_merged <- RunPCA(object = seurat_merged, assay = "SCT", npcs = 50)seurat_merged <- RunHarmony(object = seurat_merged,assay.use = "SCT",reduction = "pca",dims.use = 1:50,group.by.vars = "Group",plot_convergence = TRUE)# save integrated dataseurat_merged <- RunTSNE(seurat_merged, reduction = "harmony", verbose = TRUE, check_duplicates = FALSE)seurat_merged <- RunUMAP(seurat_merged, reduction = "harmony", dims = 1:30)saveRDS(seurat_merged, file = "../Result/RDS/seurat_integrated.RDS")
整合数据(备选方案,基于Anchors策略)
# Select the most variable features to use for integrationdata.features <- SelectIntegrationFeatures(object.list = split_seurat,nfeatures = 2000)# Prepare the SCT list object for integrationdata.seurat <- PrepSCTIntegration(object.list = split_seurat,anchor.features = data.features,verbose = FALSE)# Find best buddies - can take a while to rundata.anchors <- FindIntegrationAnchors(object.list = data.seurat,normalization.method = "SCT",anchor.features = data.features,verbose = FALSE)# Integrate across conditionsdata.integrated <- IntegrateData(anchorset = data.anchors,normalization.method = "SCT",verbose = FALSE)data.integrated <- RunPCA(data.integrated, verbose = FALSE)data.integrated <- RunUMAP(data.integrated, dims = 1:30)data.integrated <- RunTSNE(data.integrated, reduction = "pca", dims = 1:30,check_duplicates = FALSE)# save integrated dataseurat_merged <- RunTSNE(seurat_merged, reduction = "pca", verbose = TRUE, check_duplicates = FALSE)seurat_merged <- RunUMAP(seurat_merged, reduction = "pca", dims = 1:30)saveRDS(data.integrated, "../../Result/RDS/seurat_integrated.sct.rds", compress = TRUE)
查看结果
DimPlot(seurat_merged, reduction = "umap", group.by = "Group", label = TRUE, label.size = 6)DimPlot(data.integrated, reduction = "tsne", group.by = "Group", label = TRUE, label.size = 6)
Reference
参考文章如引起任何侵权问题,可以与我联系,谢谢。
