前言
批次校正是数据预处理常见的步骤,在整合多批次数据的时候,需要考虑到这些非生物学因素对treatment的影响。更多知识分享请到 https://zouhua.top/。
加载R包
library(dplyr)library(tibble)library(Biobase)library(limma)library(ggplot2)# rm(list = ls())options(stringsAsFactors = F)options(future.globals.maxSize = 1000 * 1024^2)subgrp <- c("HC", "CP", "PDAC")grp.col <- c("#568875", "#73FAFC", "#EE853D")ExprSet <- readRDS("MS_Protein_ImputeExprSet.RDS")
Function
PCAFun <- function(dataset=ExprSet){# dataset=ExprSetmetadata <- pData(dataset)profile <- exprs(dataset)# pcapca <- prcomp(scale(t(profile), center = T, scale = T))require(factoextra)eig <- get_eig(pca)# explains variableexplains <- paste0(paste0("PC", seq(2)), "(", paste0(round(eig[1:2, 2], 2), "%"), ")")# principal component score of each samplescore <- inner_join(pca$x %>% data.frame() %>%dplyr::select(c(1:2)) %>%rownames_to_column("SampleID"),metadata %>% rownames_to_column("SampleID"),by = "SampleID") %>%mutate(SubGroup=factor(SubGroup, levels = subgrp))# PERMANOVArequire(vegan)set.seed(123)if(any(profile < 0)){res_adonis <- adonis(vegdist(t(profile), method = "manhattan") ~ metadata$SubGroup, permutations = 999)}else{res_adonis <- adonis(vegdist(t(profile), method = "bray") ~ metadata$SubGroup, permutations = 999)}adn_pvalue <- res_adonis[[1]][["Pr(>F)"]][1]adn_rsquared <- round(res_adonis[[1]][["R2"]][1],3)signi_label <- paste(cut(adn_pvalue,breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ".")))adn_res_format <- bquote(atop(atop("PERMANOVA",R^2==~.(adn_rsquared)),atop("p-value="~.(adn_pvalue)~.(signi_label), phantom())))pl <- ggplot(score, aes(x=PC1, y=PC2))+#geom_point(aes(fill=SubGroup), size=3.5, shape=21, stroke = .8, color = "black")+geom_point(aes(color=SubGroup, shape=Omics), size=2.5)+stat_ellipse(aes(color=SubGroup), level = 0.95, linetype = 1, size = 1.5)+labs(x=explains[1], y=explains[2])+scale_color_manual(values = grp.col)+# scale_fill_manual(name = "Condition",# values = grp.col)+scale_shape_manual(name = "Batch",values = c(15, 16, 17))+annotate("text", x = max(score$PC1) - 8,y = min(score$PC1),label = adn_res_format,size = 6)+#guides(color="none")+theme_bw()+theme(axis.title = element_text(size = 10, color = "black", face = "bold"),axis.text = element_text(size = 9, color = "black"),text = element_text(size = 8, color = "black", family = "serif"),strip.text = element_text(size = 9, color = "black", face = "bold"),panel.grid = element_blank(),legend.title = element_text(size = 11, color = "black", family = "serif"),legend.text = element_text(size = 10, color = "black", family = "serif"),legend.position = c(0, 0),legend.justification = c(0, 0),legend.background = element_rect(color = "black", fill = "white", linetype = 2, size = 0.5))return(pl)}RemoveBatchExprSet <- function(datset=ExprSet){# datset=ExprSetpheno <- pData(datset)expr <- exprs(datset)feature <- fData(datset)# Remove batch effectsphen <- pheno %>% arrange(Omics)prof <- data.frame(expr) %>% dplyr::select(rownames(phen))if(!any(rownames(phen) == colnames(prof))){stop("The Order of SampleID in DiscoverySet is wrong")}# CA19-9 should be removed batch effectsprof_CA199 <- prof[rownames(prof) == "CA199", , F]prof_NoCA199 <- prof[rownames(prof) != "CA199", , F]prof_rbe <- removeBatchEffect(as.matrix(prof_NoCA199), batch = phen$Omics)prof_merge <- rbind(prof_rbe, prof_CA199)fdf <- new("AnnotatedDataFrame", data=feature)exprs_dis <- as.matrix(prof_merge)adf_dis <- new("AnnotatedDataFrame", data=phen)experimentData <- new("MIAME",name="Hua", lab="Lab",title="tumor Experiment",abstract="The Mass Spectrometry ExpressionSet with overlap samples",url="www.zouhua.top",other=list(notes="The Intersect Proteins and Samples"))res <- new("ExpressionSet",exprs=exprs_dis,phenoData=adf_dis,featureData=fdf,experimentData=experimentData)return(res)}RemoveBatchExprSet_Design <- function(datset=ExprSet){# datset=ExprSetpheno <- pData(datset)expr <- exprs(datset)feature <- fData(datset)# designDesign <- model.matrix(~SubGroup, data = pheno)# Remove batch effectsphen <- pheno %>% arrange(Omics)prof <- data.frame(expr) %>% dplyr::select(rownames(phen))if(!any(rownames(phen) == colnames(prof))){stop("The Order of SampleID in DiscoverySet is wrong")}# CA19-9 should be removed batch effectsprof_CA199 <- prof[rownames(prof) == "CA199", , F]prof_NoCA199 <- prof[rownames(prof) != "CA199", , F]prof_rbe <- removeBatchEffect(as.matrix(prof_NoCA199), batch = phen$Omics, design = Design)prof_merge <- rbind(prof_rbe, prof_CA199)fdf <- new("AnnotatedDataFrame", data=feature)exprs_dis <- as.matrix(prof_merge)adf_dis <- new("AnnotatedDataFrame", data=phen)experimentData <- new("MIAME",name="Hua", lab="Lab",title="tumor Experiment",abstract="The Mass Spectrometry ExpressionSet with overlap samples",url="www.zouhua.top",other=list(notes="The Intersect Proteins and Samples"))res <- new("ExpressionSet",exprs=exprs_dis,phenoData=adf_dis,featureData=fdf,experimentData=experimentData)return(res)}# Batch mean centeringRemoveBatchExprSet_BMC <- function(datset=ExprSet){# datset=ExprSetpheno <- pData(datset)expr <- exprs(datset)feature <- fData(datset)# Remove batch effectsphen <- pheno %>% arrange(Omics)prof <- data.frame(expr) %>% dplyr::select(rownames(phen))# CA19-9 should be removed batch effectsprof_CA199 <- prof[rownames(prof) == "CA199", , F]prof_NoCA199 <- prof[rownames(prof) != "CA199", , F]prof_list <- list()batchlevels <- unique(phen$Omics)for(i in 1:length(batchlevels)){prof_list[[i]] <- scale(t(prof_NoCA199 %>%dplyr::select(rownames(phen[phen$Omics == batchlevels[i], ]))),center = TRUE, scale = FALSE)}prof_merge <- rbind(t(do.call(rbind, prof_list)), prof_CA199)if(!any(rownames(phen) == colnames(prof_merge))){stop("The Order of SampleID in DiscoverySet is wrong")}fdf <- new("AnnotatedDataFrame", data=feature)exprs_dis <- as.matrix(prof_merge)adf_dis <- new("AnnotatedDataFrame", data=phen)experimentData <- new("MIAME",name="Hua", lab="Lab",title="tumor Experiment",abstract="The Mass Spectrometry ExpressionSet with overlap samples",url="www.zouhua.top",other=list(notes="The Intersect Proteins and Samples"))res <- new("ExpressionSet",exprs=exprs_dis,phenoData=adf_dis,featureData=fdf,experimentData=experimentData)return(res)}# ComBatRemoveBatchExprSet_ComBat <- function(datset=ExprSet){# datset=ExprSetpheno <- pData(datset)expr <- exprs(datset)feature <- fData(datset)# designDesign <- model.matrix(~SubGroup, data = pheno)# Remove batch effectsphen <- pheno %>% arrange(Omics)prof <- data.frame(expr) %>% dplyr::select(rownames(phen))# CA19-9 should be removed batch effectsprof_CA199 <- prof[rownames(prof) == "CA199", , F]prof_NoCA199 <- prof[rownames(prof) != "CA199", , F]prof_combat <- sva::ComBat(prof_NoCA199, batch = phen$Omics, mod = Design, par.prior = F, prior.plots = F)prof_merge <- rbind(prof_combat, prof_CA199)if(!any(rownames(phen) == colnames(prof_merge))){stop("The Order of SampleID in DiscoverySet is wrong")}fdf <- new("AnnotatedDataFrame", data=feature)exprs_dis <- as.matrix(prof_merge)adf_dis <- new("AnnotatedDataFrame", data=phen)experimentData <- new("MIAME",name="Hua", lab="Lab",title="tumor Experiment",abstract="The Mass Spectrometry ExpressionSet with overlap samples",url="www.zouhua.top",other=list(notes="The Intersect Proteins and Samples"))res <- new("ExpressionSet",exprs=exprs_dis,phenoData=adf_dis,featureData=fdf,experimentData=experimentData)return(res)}
Run
ExprSet_RBE <- RemoveBatchExprSet(datset=ExprSet)ExprSet_RBE_Design <- RemoveBatchExprSet_Design(datset=ExprSet)ExprSet_RBE_BMC <- RemoveBatchExprSet_BMC(datset=ExprSet)ExprSet_RBE_ComBat <- RemoveBatchExprSet_ComBat(datset=ExprSet)cowplot::plot_grid(#PCAFun(dataset=ExprSet),PCAFun(dataset=ExprSet_RBE),PCAFun(dataset=ExprSet_RBE_Design),PCAFun(dataset=ExprSet_RBE_BMC),PCAFun(dataset=ExprSet_RBE_ComBat),align = "hv",ncol = 2,labels = c(#"Origin","Batch corrected","Batch corrected with Design","Batch corrected with BMC","Batch corrected with ComBat"))
systemic information
sessionInfo()
R version 4.0.2 (2020-06-22)Platform: x86_64-conda_cos6-linux-gnu (64-bit)Running under: CentOS Linux 8 (Core)Matrix products: defaultBLAS/LAPACK: /disk/share/anaconda3/lib/libopenblasp-r0.3.10.solocale:[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C[9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=Cattached base packages:[1] parallel stats graphics grDevices utils datasets methods baseother attached packages:[1] DEP_1.12.0 gg.gap_1.4 ggpubr_0.4.0 gtsummary_1.4.0 vegan_2.5-6[6] lattice_0.20-45 permute_0.9-5 factoextra_1.0.7 ggrepel_0.9.1.9999 sampling_2.8[11] data.table_1.14.2 convert_1.64.0 marray_1.68.0 limma_3.46.0 Biobase_2.50.0[16] BiocGenerics_0.36.0 ggplot2_3.3.5 tibble_3.1.6 dplyr_1.0.7loaded via a namespace (and not attached):[1] utf8_1.2.2 shinydashboard_0.7.1 reticulate_1.18[4] gmm_1.6-5 tidyselect_1.1.1 htmlwidgets_1.5.4[7] grid_4.0.2 BiocParallel_1.24.1 lpSolve_5.6.15[10] norm_1.0-9.5 munsell_0.5.0 codetools_0.2-18[13] preprocessCore_1.52.1 DT_0.20 umap_0.2.7.0[16] withr_2.4.3 colorspace_2.0-2 knitr_1.37[19] rstudioapi_0.13 stats4_4.0.2 robustbase_0.93-6[22] bayesm_3.1-4 ggsignif_0.6.0 mzID_1.28.0[25] MatrixGenerics_1.2.1 labeling_0.4.2 GenomeInfoDbData_1.2.4[28] farver_2.1.0 vctrs_0.3.8 generics_0.1.1[31] xfun_0.29 R6_2.5.1 doParallel_1.0.16[34] GenomeInfoDb_1.26.4 clue_0.3-60 bitops_1.0-7[37] DelayedArray_0.16.3 assertthat_0.2.1 promises_1.2.0.1[40] scales_1.1.1 gtable_0.3.0 Cairo_1.5-12.2[43] affy_1.68.0 sandwich_3.0-0 rlang_0.4.12[46] mzR_2.24.1 GlobalOptions_0.1.2 splines_4.0.2[49] rstatix_0.7.0 impute_1.64.0 broom_0.7.9[52] BiocManager_1.30.16 yaml_2.2.1 abind_1.4-5[55] crosstalk_1.2.0 backports_1.4.1 httpuv_1.6.4[58] tensorA_0.36.2 tools_4.0.2 affyio_1.60.0[61] ellipsis_0.3.2 jquerylib_0.1.4 RColorBrewer_1.1-2[64] MSnbase_2.16.1 Rcpp_1.0.7 plyr_1.8.6[67] zlibbioc_1.36.0 purrr_0.3.4 RCurl_1.98-1.3[70] openssl_1.4.6 GetoptLong_1.0.5 cowplot_1.1.0[73] S4Vectors_0.28.1 zoo_1.8-8 SummarizedExperiment_1.20.0[76] haven_2.3.1 cluster_2.1.0 tinytex_0.36[79] magrittr_2.0.1 RSpectra_0.16-0 openxlsx_4.2.3[82] circlize_0.4.10 pcaMethods_1.80.0 mvtnorm_1.1-1[85] ProtGenerics_1.22.0 matrixStats_0.61.0 xtable_1.8-4[88] hms_1.1.1 mime_0.12 evaluate_0.14[91] XML_3.99-0.6 rio_0.5.16 readxl_1.3.1[94] IRanges_2.24.1 shape_1.4.5 compiler_4.0.2[97] gt_0.2.2 ncdf4_1.17 crayon_1.4.2[100] htmltools_0.5.2 mgcv_1.8-38 later_1.2.0[103] tzdb_0.2.0 tidyr_1.1.4 DBI_1.1.2[106] ComplexHeatmap_2.6.2 MASS_7.3-54 tmvtnorm_1.4-10[109] broom.helpers_1.3.0 compositions_2.0-2 Matrix_1.4-0[112] car_3.0-10 readr_2.1.1 cli_3.1.0[115] vsn_3.58.0 imputeLCMD_2.0 GenomicRanges_1.42.0[118] forcats_0.5.0 pkgconfig_2.0.3 foreign_0.8-81[121] MALDIquant_1.19.3 foreach_1.5.1 bslib_0.3.1[124] XVector_0.30.0 stringr_1.4.0 digest_0.6.29[127] rmarkdown_2.11 cellranger_1.1.0 curl_4.3.2[130] shiny_1.7.1 rjson_0.2.20 lifecycle_1.0.1[133] nlme_3.1-150 jsonlite_1.7.2 carData_3.0-4[136] askpass_1.1 fansi_0.5.0 pillar_1.6.4[139] fastmap_1.1.0 DEoptimR_1.0-8 survival_3.2-13[142] glue_1.6.0 zip_2.1.1 png_0.1-7[145] iterators_1.0.13 stringi_1.4.6 sass_0.4.0
Reference
参考文章如引起任何侵权问题,可以与我联系,谢谢。
