前言
批次校正是数据预处理常见的步骤,在整合多批次数据的时候,需要考虑到这些非生物学因素对treatment的影响。更多知识分享请到 https://zouhua.top/。
加载R包
library(dplyr)
library(tibble)
library(Biobase)
library(limma)
library(ggplot2)
# rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 1000 * 1024^2)
subgrp <- c("HC", "CP", "PDAC")
grp.col <- c("#568875", "#73FAFC", "#EE853D")
ExprSet <- readRDS("MS_Protein_ImputeExprSet.RDS")
Function
PCAFun <- function(dataset=ExprSet){
# dataset=ExprSet
metadata <- pData(dataset)
profile <- exprs(dataset)
# pca
pca <- prcomp(scale(t(profile), center = T, scale = T))
require(factoextra)
eig <- get_eig(pca)
# explains variable
explains <- paste0(paste0("PC", seq(2)), "(", paste0(round(eig[1:2, 2], 2), "%"), ")")
# principal component score of each sample
score <- inner_join(pca$x %>% data.frame() %>%
dplyr::select(c(1:2)) %>%
rownames_to_column("SampleID"),
metadata %>% rownames_to_column("SampleID"),
by = "SampleID") %>%
mutate(SubGroup=factor(SubGroup, levels = subgrp))
# PERMANOVA
require(vegan)
set.seed(123)
if(any(profile < 0)){
res_adonis <- adonis(vegdist(t(profile), method = "manhattan") ~ metadata$SubGroup, permutations = 999)
}else{
res_adonis <- adonis(vegdist(t(profile), method = "bray") ~ metadata$SubGroup, permutations = 999)
}
adn_pvalue <- res_adonis[[1]][["Pr(>F)"]][1]
adn_rsquared <- round(res_adonis[[1]][["R2"]][1],3)
signi_label <- paste(cut(adn_pvalue,breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ".")))
adn_res_format <- bquote(atop(atop("PERMANOVA",R^2==~.(adn_rsquared)),
atop("p-value="~.(adn_pvalue)~.(signi_label), phantom())))
pl <- ggplot(score, aes(x=PC1, y=PC2))+
#geom_point(aes(fill=SubGroup), size=3.5, shape=21, stroke = .8, color = "black")+
geom_point(aes(color=SubGroup, shape=Omics), size=2.5)+
stat_ellipse(aes(color=SubGroup), level = 0.95, linetype = 1, size = 1.5)+
labs(x=explains[1], y=explains[2])+
scale_color_manual(values = grp.col)+
# scale_fill_manual(name = "Condition",
# values = grp.col)+
scale_shape_manual(name = "Batch",
values = c(15, 16, 17))+
annotate("text", x = max(score$PC1) - 8,
y = min(score$PC1),
label = adn_res_format,
size = 6)+
#guides(color="none")+
theme_bw()+
theme(axis.title = element_text(size = 10, color = "black", face = "bold"),
axis.text = element_text(size = 9, color = "black"),
text = element_text(size = 8, color = "black", family = "serif"),
strip.text = element_text(size = 9, color = "black", face = "bold"),
panel.grid = element_blank(),
legend.title = element_text(size = 11, color = "black", family = "serif"),
legend.text = element_text(size = 10, color = "black", family = "serif"),
legend.position = c(0, 0),
legend.justification = c(0, 0),
legend.background = element_rect(color = "black", fill = "white", linetype = 2, size = 0.5))
return(pl)
}
RemoveBatchExprSet <- function(datset=ExprSet){
# datset=ExprSet
pheno <- pData(datset)
expr <- exprs(datset)
feature <- fData(datset)
# Remove batch effects
phen <- pheno %>% arrange(Omics)
prof <- data.frame(expr) %>% dplyr::select(rownames(phen))
if(!any(rownames(phen) == colnames(prof))){
stop("The Order of SampleID in DiscoverySet is wrong")
}
# CA19-9 should be removed batch effects
prof_CA199 <- prof[rownames(prof) == "CA199", , F]
prof_NoCA199 <- prof[rownames(prof) != "CA199", , F]
prof_rbe <- removeBatchEffect(as.matrix(prof_NoCA199), batch = phen$Omics)
prof_merge <- rbind(prof_rbe, prof_CA199)
fdf <- new("AnnotatedDataFrame", data=feature)
exprs_dis <- as.matrix(prof_merge)
adf_dis <- new("AnnotatedDataFrame", data=phen)
experimentData <- new("MIAME",
name="Hua", lab="Lab",
title="tumor Experiment",
abstract="The Mass Spectrometry ExpressionSet with overlap samples",
url="www.zouhua.top",
other=list(notes="The Intersect Proteins and Samples"))
res <- new("ExpressionSet",
exprs=exprs_dis,
phenoData=adf_dis,
featureData=fdf,
experimentData=experimentData)
return(res)
}
RemoveBatchExprSet_Design <- function(datset=ExprSet){
# datset=ExprSet
pheno <- pData(datset)
expr <- exprs(datset)
feature <- fData(datset)
# design
Design <- model.matrix(~SubGroup, data = pheno)
# Remove batch effects
phen <- pheno %>% arrange(Omics)
prof <- data.frame(expr) %>% dplyr::select(rownames(phen))
if(!any(rownames(phen) == colnames(prof))){
stop("The Order of SampleID in DiscoverySet is wrong")
}
# CA19-9 should be removed batch effects
prof_CA199 <- prof[rownames(prof) == "CA199", , F]
prof_NoCA199 <- prof[rownames(prof) != "CA199", , F]
prof_rbe <- removeBatchEffect(as.matrix(prof_NoCA199), batch = phen$Omics, design = Design)
prof_merge <- rbind(prof_rbe, prof_CA199)
fdf <- new("AnnotatedDataFrame", data=feature)
exprs_dis <- as.matrix(prof_merge)
adf_dis <- new("AnnotatedDataFrame", data=phen)
experimentData <- new("MIAME",
name="Hua", lab="Lab",
title="tumor Experiment",
abstract="The Mass Spectrometry ExpressionSet with overlap samples",
url="www.zouhua.top",
other=list(notes="The Intersect Proteins and Samples"))
res <- new("ExpressionSet",
exprs=exprs_dis,
phenoData=adf_dis,
featureData=fdf,
experimentData=experimentData)
return(res)
}
# Batch mean centering
RemoveBatchExprSet_BMC <- function(datset=ExprSet){
# datset=ExprSet
pheno <- pData(datset)
expr <- exprs(datset)
feature <- fData(datset)
# Remove batch effects
phen <- pheno %>% arrange(Omics)
prof <- data.frame(expr) %>% dplyr::select(rownames(phen))
# CA19-9 should be removed batch effects
prof_CA199 <- prof[rownames(prof) == "CA199", , F]
prof_NoCA199 <- prof[rownames(prof) != "CA199", , F]
prof_list <- list()
batchlevels <- unique(phen$Omics)
for(i in 1:length(batchlevels)){
prof_list[[i]] <- scale(t(prof_NoCA199 %>%
dplyr::select(rownames(phen[phen$Omics == batchlevels[i], ]))),
center = TRUE, scale = FALSE)
}
prof_merge <- rbind(t(do.call(rbind, prof_list)), prof_CA199)
if(!any(rownames(phen) == colnames(prof_merge))){
stop("The Order of SampleID in DiscoverySet is wrong")
}
fdf <- new("AnnotatedDataFrame", data=feature)
exprs_dis <- as.matrix(prof_merge)
adf_dis <- new("AnnotatedDataFrame", data=phen)
experimentData <- new("MIAME",
name="Hua", lab="Lab",
title="tumor Experiment",
abstract="The Mass Spectrometry ExpressionSet with overlap samples",
url="www.zouhua.top",
other=list(notes="The Intersect Proteins and Samples"))
res <- new("ExpressionSet",
exprs=exprs_dis,
phenoData=adf_dis,
featureData=fdf,
experimentData=experimentData)
return(res)
}
# ComBat
RemoveBatchExprSet_ComBat <- function(datset=ExprSet){
# datset=ExprSet
pheno <- pData(datset)
expr <- exprs(datset)
feature <- fData(datset)
# design
Design <- model.matrix(~SubGroup, data = pheno)
# Remove batch effects
phen <- pheno %>% arrange(Omics)
prof <- data.frame(expr) %>% dplyr::select(rownames(phen))
# CA19-9 should be removed batch effects
prof_CA199 <- prof[rownames(prof) == "CA199", , F]
prof_NoCA199 <- prof[rownames(prof) != "CA199", , F]
prof_combat <- sva::ComBat(prof_NoCA199, batch = phen$Omics, mod = Design, par.prior = F, prior.plots = F)
prof_merge <- rbind(prof_combat, prof_CA199)
if(!any(rownames(phen) == colnames(prof_merge))){
stop("The Order of SampleID in DiscoverySet is wrong")
}
fdf <- new("AnnotatedDataFrame", data=feature)
exprs_dis <- as.matrix(prof_merge)
adf_dis <- new("AnnotatedDataFrame", data=phen)
experimentData <- new("MIAME",
name="Hua", lab="Lab",
title="tumor Experiment",
abstract="The Mass Spectrometry ExpressionSet with overlap samples",
url="www.zouhua.top",
other=list(notes="The Intersect Proteins and Samples"))
res <- new("ExpressionSet",
exprs=exprs_dis,
phenoData=adf_dis,
featureData=fdf,
experimentData=experimentData)
return(res)
}
Run
ExprSet_RBE <- RemoveBatchExprSet(datset=ExprSet)
ExprSet_RBE_Design <- RemoveBatchExprSet_Design(datset=ExprSet)
ExprSet_RBE_BMC <- RemoveBatchExprSet_BMC(datset=ExprSet)
ExprSet_RBE_ComBat <- RemoveBatchExprSet_ComBat(datset=ExprSet)
cowplot::plot_grid(#PCAFun(dataset=ExprSet),
PCAFun(dataset=ExprSet_RBE),
PCAFun(dataset=ExprSet_RBE_Design),
PCAFun(dataset=ExprSet_RBE_BMC),
PCAFun(dataset=ExprSet_RBE_ComBat),
align = "hv",
ncol = 2,
labels = c(#"Origin",
"Batch corrected",
"Batch corrected with Design",
"Batch corrected with BMC",
"Batch corrected with ComBat"))
systemic information
sessionInfo()
R version 4.0.2 (2020-06-22)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: CentOS Linux 8 (Core)
Matrix products: default
BLAS/LAPACK: /disk/share/anaconda3/lib/libopenblasp-r0.3.10.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods base
other attached packages:
[1] DEP_1.12.0 gg.gap_1.4 ggpubr_0.4.0 gtsummary_1.4.0 vegan_2.5-6
[6] lattice_0.20-45 permute_0.9-5 factoextra_1.0.7 ggrepel_0.9.1.9999 sampling_2.8
[11] data.table_1.14.2 convert_1.64.0 marray_1.68.0 limma_3.46.0 Biobase_2.50.0
[16] BiocGenerics_0.36.0 ggplot2_3.3.5 tibble_3.1.6 dplyr_1.0.7
loaded via a namespace (and not attached):
[1] utf8_1.2.2 shinydashboard_0.7.1 reticulate_1.18
[4] gmm_1.6-5 tidyselect_1.1.1 htmlwidgets_1.5.4
[7] grid_4.0.2 BiocParallel_1.24.1 lpSolve_5.6.15
[10] norm_1.0-9.5 munsell_0.5.0 codetools_0.2-18
[13] preprocessCore_1.52.1 DT_0.20 umap_0.2.7.0
[16] withr_2.4.3 colorspace_2.0-2 knitr_1.37
[19] rstudioapi_0.13 stats4_4.0.2 robustbase_0.93-6
[22] bayesm_3.1-4 ggsignif_0.6.0 mzID_1.28.0
[25] MatrixGenerics_1.2.1 labeling_0.4.2 GenomeInfoDbData_1.2.4
[28] farver_2.1.0 vctrs_0.3.8 generics_0.1.1
[31] xfun_0.29 R6_2.5.1 doParallel_1.0.16
[34] GenomeInfoDb_1.26.4 clue_0.3-60 bitops_1.0-7
[37] DelayedArray_0.16.3 assertthat_0.2.1 promises_1.2.0.1
[40] scales_1.1.1 gtable_0.3.0 Cairo_1.5-12.2
[43] affy_1.68.0 sandwich_3.0-0 rlang_0.4.12
[46] mzR_2.24.1 GlobalOptions_0.1.2 splines_4.0.2
[49] rstatix_0.7.0 impute_1.64.0 broom_0.7.9
[52] BiocManager_1.30.16 yaml_2.2.1 abind_1.4-5
[55] crosstalk_1.2.0 backports_1.4.1 httpuv_1.6.4
[58] tensorA_0.36.2 tools_4.0.2 affyio_1.60.0
[61] ellipsis_0.3.2 jquerylib_0.1.4 RColorBrewer_1.1-2
[64] MSnbase_2.16.1 Rcpp_1.0.7 plyr_1.8.6
[67] zlibbioc_1.36.0 purrr_0.3.4 RCurl_1.98-1.3
[70] openssl_1.4.6 GetoptLong_1.0.5 cowplot_1.1.0
[73] S4Vectors_0.28.1 zoo_1.8-8 SummarizedExperiment_1.20.0
[76] haven_2.3.1 cluster_2.1.0 tinytex_0.36
[79] magrittr_2.0.1 RSpectra_0.16-0 openxlsx_4.2.3
[82] circlize_0.4.10 pcaMethods_1.80.0 mvtnorm_1.1-1
[85] ProtGenerics_1.22.0 matrixStats_0.61.0 xtable_1.8-4
[88] hms_1.1.1 mime_0.12 evaluate_0.14
[91] XML_3.99-0.6 rio_0.5.16 readxl_1.3.1
[94] IRanges_2.24.1 shape_1.4.5 compiler_4.0.2
[97] gt_0.2.2 ncdf4_1.17 crayon_1.4.2
[100] htmltools_0.5.2 mgcv_1.8-38 later_1.2.0
[103] tzdb_0.2.0 tidyr_1.1.4 DBI_1.1.2
[106] ComplexHeatmap_2.6.2 MASS_7.3-54 tmvtnorm_1.4-10
[109] broom.helpers_1.3.0 compositions_2.0-2 Matrix_1.4-0
[112] car_3.0-10 readr_2.1.1 cli_3.1.0
[115] vsn_3.58.0 imputeLCMD_2.0 GenomicRanges_1.42.0
[118] forcats_0.5.0 pkgconfig_2.0.3 foreign_0.8-81
[121] MALDIquant_1.19.3 foreach_1.5.1 bslib_0.3.1
[124] XVector_0.30.0 stringr_1.4.0 digest_0.6.29
[127] rmarkdown_2.11 cellranger_1.1.0 curl_4.3.2
[130] shiny_1.7.1 rjson_0.2.20 lifecycle_1.0.1
[133] nlme_3.1-150 jsonlite_1.7.2 carData_3.0-4
[136] askpass_1.1 fansi_0.5.0 pillar_1.6.4
[139] fastmap_1.1.0 DEoptimR_1.0-8 survival_3.2-13
[142] glue_1.6.0 zip_2.1.1 png_0.1-7
[145] iterators_1.0.13 stringi_1.4.6 sass_0.4.0
Reference
参考文章如引起任何侵权问题,可以与我联系,谢谢。