single cell 转录组数据过滤和整合是一个较为复杂的问题,也是最基本的问题。记录一下自己整合数据的过程。更多知识分享请到 https://zouhua.top/。
加载R包
library(dplyr)
library(Seurat)
library(tibble)
library(patchwork)
library(ggplot2)
library(data.table)
library(harmony)
读取数据
rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 10000 * 1024^2)
hour4 <- fread("../Study/RawData/4hours.dge.txt") %>%
column_to_rownames("V1")
hour12 <- fread("../Study/RawData/12hours.dge.txt") %>%
column_to_rownames("V1")
day2 <- fread("../Study/RawData/2days.dge.txt") %>%
column_to_rownames("V1")
day14 <- fread("../Study/RawData/14days.dge.txt") %>%
column_to_rownames("V1")
week6 <- fread("../Study/RawData/6weeks.dge.txt") %>%
column_to_rownames("V1")
control <- fread("../Study/RawData/control.dge.txt") %>%
column_to_rownames("V1")
phen <- fread("../Study/Phenotype/metadata.txt") %>%
column_to_rownames("V1")
创建seurat对象
CreateObject <- function(count=count,
metadata=phen,
proj="GSE139107"){
sid <- intersect(rownames(metadata), colnames(count))
metadata.cln <- metadata[rownames(metadata)%in%sid, ]
count.cln <- count %>% dplyr::select(sid)
res <- CreateSeuratObject(counts = count.cln,
project = proj,
assay = "RNA",
min.cells = 3,
min.features = 150,
meta.data = metadata.cln)
res[["Batch"]] <- proj
return(res)
}
hour4_seurat <- CreateObject(count = hour4)
hour12_seurat <- CreateObject(count = hour12)
day2_seurat <- CreateObject(count = day2)
day14_seurat <- CreateObject(count = day14)
week6_seurat <- CreateObject(count = week6)
control_seurat <- CreateObject(count = control)
合并对象
seurat_object <- merge(x = control_seurat,
y = c(hour4_seurat, hour12_seurat, day2_seurat, day14_seurat, week6_seurat))
rm(control_seurat, hour4_seurat, hour12_seurat, day2_seurat, day14_seurat, week6_seurat)
# save data
dir_RDS <- "../Result/RDS"
if(!dir.exists(dir_RDS)){
dir.create(dir_RDS)
}
seurat_object_name <- paste0(dir_RDS, "/seurat_merge_all.RDS")
saveRDS(seurat_object, file = seurat_object_name)
每组的细胞数量
ggplot(data.frame(table(seurat_object@meta.data$orig.ident)) %>% setNames(c("samples", "Number")) ,aes(x=samples, y=Number))+
geom_bar(stat = "identity") +
ylab("The Number of cell per sample")+
geom_hline(yintercept = 2000, linetype = 2) +
theme_classic()+
theme(axis.text.x = element_text(angle = 45, hjust = .5, vjust = .5))
dim(seurat_object@meta.data)
过滤细胞
- nFeature_RNA
- nCount_RNA
- mitochondrion gene
# seurat_object <- readRDS("../Result/RDS/seurat_merge_all.RDS")
# seurat_object@assays$RNA@counts@Dimnames[[1]] feature name
seurat_object[["mitoRatio"]] <- PercentageFeatureSet(seurat_object, pattern = "^mt-")
# before
VlnPlot(seurat_object, features = c("nFeature_RNA", "nCount_RNA", "mitoRatio"),
ncol = 2, pt.size = 0.2, group.by = "Group")
# Get QC Thresholds
ncount_q <- quantile(seurat_object@meta.data$nCount_RNA, c(0.025, 0.975))
nfeature_q <- quantile(seurat_object@meta.data$nFeature_RNA, c(0.025, 0.975))
mt_q <- quantile(Sobject@meta.data$percent.mt, c(0.025, 0.975))
# QC Plots
plot(seurat_object@meta.data$nCount_RNA, seurat_object@meta.data$nFeature_RNA,
pch=16,
cex=0.7,
bty="n")
abline(h=c(as.numeric(nfeature_q)[1], as.numeric(nfeature_q)[2]),
v=c(as.numeric(ncount_q)[1], as.numeric(ncount_q)[2]),
lty=2, lwd=2, col="red")
# filter
seurat_object <- subset(seurat_object, subset = nFeature_RNA > as.numeric(nfeature_q)[1] &
nFeature_RNA < as.numeric(nfeature_q)[2] &
nCount_RNA > as.numeric(ncount_q)[1] &
nCount_RNA < as.numeric(ncount_q)[2] &
percent.mt < as.numeric(mt_q)[2] )
# after
VlnPlot(seurat_object, features = c("nFeature_RNA", "nCount_RNA", "mitoRatio"), ncol = 2, pt.size = 0.2, group.by = "Group")
# save data
saveRDS(seurat_object, file = "../Result/RDS/seurat_merge_all_filter.RDS")
查看过滤后数据分布(整合前)
# seurat_object <- readRDS("../Result/RDS/seurat_merge_all_filter.RDS")
seurat_object <- FindVariableFeatures(object = seurat_object,
selection.method = "vst",
nfeatures = 2000,
verbose = TRUE)
seurat_object <- ScaleData(object = seurat_object, features = rownames(seurat_object))
seurat_object <- RunPCA(seurat_object, npcs = 30, verbose = TRUE)
seurat_object <- RunTSNE(seurat_object, reduction = "pca", verbose = TRUE, check_duplicates = FALSE)
seurat_object <- RunUMAP(seurat_object, reduction = "pca", dims = 1:30)
DimPlot(seurat_object, reduction = "umap", group.by = "Group")
整合数据(结合harmony包的RunHarmony函数)
split_seurat <- SplitObject(seurat_object, split.by = "Group")
for (i in 1:length(split_seurat)) {
split_seurat[[i]] <- SCTransform(split_seurat[[i]],
vars.to.regress = c("mitoRatio"),
verbose = TRUE)
}
seurat_features <- SelectIntegrationFeatures(object.list = split_seurat, nfeatures = 2000)
seurat_merged <- merge(split_seurat[[1]], y = split_seurat[2:length(split_seurat)],
project = "GSE139107", merge.data = TRUE)
VariableFeatures(seurat_merged) <- seurat_features
seurat_merged <- RunPCA(object = seurat_merged, assay = "SCT", npcs = 50)
seurat_merged <- RunHarmony(object = seurat_merged,
assay.use = "SCT",
reduction = "pca",
dims.use = 1:50,
group.by.vars = "Group",
plot_convergence = TRUE)
# save integrated data
seurat_merged <- RunTSNE(seurat_merged, reduction = "harmony", verbose = TRUE, check_duplicates = FALSE)
seurat_merged <- RunUMAP(seurat_merged, reduction = "harmony", dims = 1:30)
saveRDS(seurat_merged, file = "../Result/RDS/seurat_integrated.RDS")
整合数据(备选方案,基于Anchors策略)
# Select the most variable features to use for integration
data.features <- SelectIntegrationFeatures(object.list = split_seurat,
nfeatures = 2000)
# Prepare the SCT list object for integration
data.seurat <- PrepSCTIntegration(object.list = split_seurat,
anchor.features = data.features,
verbose = FALSE)
# Find best buddies - can take a while to run
data.anchors <- FindIntegrationAnchors(object.list = data.seurat,
normalization.method = "SCT",
anchor.features = data.features,
verbose = FALSE)
# Integrate across conditions
data.integrated <- IntegrateData(anchorset = data.anchors,
normalization.method = "SCT",
verbose = FALSE)
data.integrated <- RunPCA(data.integrated, verbose = FALSE)
data.integrated <- RunUMAP(data.integrated, dims = 1:30)
data.integrated <- RunTSNE(data.integrated, reduction = "pca", dims = 1:30,
check_duplicates = FALSE)
# save integrated data
seurat_merged <- RunTSNE(seurat_merged, reduction = "pca", verbose = TRUE, check_duplicates = FALSE)
seurat_merged <- RunUMAP(seurat_merged, reduction = "pca", dims = 1:30)
saveRDS(data.integrated, "../../Result/RDS/seurat_integrated.sct.rds", compress = TRUE)
查看结果
DimPlot(seurat_merged, reduction = "umap", group.by = "Group", label = TRUE, label.size = 6)
DimPlot(data.integrated, reduction = "tsne", group.by = "Group", label = TRUE, label.size = 6)
Reference
参考文章如引起任何侵权问题,可以与我联系,谢谢。