数据分析：数据预处理--批次校正（四） - 《数据分析》

前言
加载R包
Function
Run
systemic information
Reference

前言

批次校正是数据预处理常见的步骤，在整合多批次数据的时候，需要考虑到这些非生物学因素对treatment的影响。更多知识分享请到 https://zouhua.top/。

加载R包

library(dplyr)
library(tibble)
library(Biobase)
library(limma)
library(ggplot2)
# rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 1000 * 1024^2)
subgrp <- c("HC", "CP", "PDAC")
grp.col <- c("#568875", "#73FAFC", "#EE853D")
ExprSet <- readRDS("MS_Protein_ImputeExprSet.RDS")

Function

PCAFun <- function(dataset=ExprSet){
  # dataset=ExprSet
  metadata <- pData(dataset)
  profile <- exprs(dataset)
  # pca 
  pca <- prcomp(scale(t(profile), center = T, scale = T))
  require(factoextra)
  eig <- get_eig(pca)
  # explains variable 
  explains <- paste0(paste0("PC", seq(2)), "(", paste0(round(eig[1:2, 2], 2), "%"), ")")
  # principal component score of each sample
  score <- inner_join(pca$x %>% data.frame() %>% 
                        dplyr::select(c(1:2)) %>% 
                        rownames_to_column("SampleID"), 
                      metadata %>% rownames_to_column("SampleID"),
                      by = "SampleID") %>%
    mutate(SubGroup=factor(SubGroup, levels = subgrp))
  # PERMANOVA
  require(vegan)
  set.seed(123)
  if(any(profile < 0)){
    res_adonis <- adonis(vegdist(t(profile), method = "manhattan") ~ metadata$SubGroup, permutations = 999) 
  }else{
    res_adonis <- adonis(vegdist(t(profile), method = "bray") ~ metadata$SubGroup, permutations = 999)    
  }
  adn_pvalue <- res_adonis[[1]][["Pr(>F)"]][1]
  adn_rsquared <- round(res_adonis[[1]][["R2"]][1],3)
  signi_label <- paste(cut(adn_pvalue,breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ".")))
  adn_res_format <- bquote(atop(atop("PERMANOVA",R^2==~.(adn_rsquared)),
                                  atop("p-value="~.(adn_pvalue)~.(signi_label), phantom()))) 
  pl <- ggplot(score, aes(x=PC1, y=PC2))+
              #geom_point(aes(fill=SubGroup), size=3.5, shape=21, stroke = .8, color = "black")+
              geom_point(aes(color=SubGroup, shape=Omics), size=2.5)+
              stat_ellipse(aes(color=SubGroup), level = 0.95, linetype = 1, size = 1.5)+
              labs(x=explains[1], y=explains[2])+
              scale_color_manual(values = grp.col)+
              # scale_fill_manual(name = "Condition", 
              #                   values = grp.col)+
              scale_shape_manual(name = "Batch", 
                                 values = c(15, 16, 17))+    
              annotate("text", x = max(score$PC1) - 8,
                       y = min(score$PC1),
                       label = adn_res_format,
                       size = 6)+ 
              #guides(color="none")+
              theme_bw()+
              theme(axis.title = element_text(size = 10, color = "black", face = "bold"),
                    axis.text = element_text(size = 9, color = "black"),
                    text = element_text(size = 8, color = "black", family = "serif"),
                    strip.text = element_text(size = 9, color = "black", face = "bold"), 
                    panel.grid = element_blank(),
                    legend.title = element_text(size = 11, color = "black", family = "serif"),
                    legend.text = element_text(size = 10, color = "black", family = "serif"),
                    legend.position = c(0, 0),
                    legend.justification = c(0, 0),
                    legend.background = element_rect(color = "black", fill = "white", linetype = 2, size = 0.5))
  return(pl)
}
RemoveBatchExprSet <- function(datset=ExprSet){
  # datset=ExprSet
  pheno <- pData(datset)
  expr <- exprs(datset)
  feature <- fData(datset)
  # Remove batch effects
  phen <- pheno %>% arrange(Omics)
  prof <- data.frame(expr) %>% dplyr::select(rownames(phen))
  if(!any(rownames(phen) == colnames(prof))){
    stop("The Order of SampleID in DiscoverySet is wrong")
  }
  # CA19-9 should be removed batch effects
  prof_CA199 <- prof[rownames(prof) == "CA199", , F]
  prof_NoCA199 <- prof[rownames(prof) != "CA199", , F]
  prof_rbe <- removeBatchEffect(as.matrix(prof_NoCA199), batch = phen$Omics)
  prof_merge <- rbind(prof_rbe, prof_CA199)
  fdf <- new("AnnotatedDataFrame", data=feature)
  exprs_dis <- as.matrix(prof_merge)
  adf_dis <-  new("AnnotatedDataFrame", data=phen)
  experimentData <- new("MIAME",
          name="Hua", lab="Lab",
          title="tumor Experiment",
          abstract="The Mass Spectrometry ExpressionSet with overlap samples",
          url="www.zouhua.top",
          other=list(notes="The Intersect Proteins and Samples"))
  res <- new("ExpressionSet", 
                exprs=exprs_dis,
                phenoData=adf_dis,
                featureData=fdf,
                experimentData=experimentData)
  return(res)  
}
RemoveBatchExprSet_Design <- function(datset=ExprSet){
  # datset=ExprSet
  pheno <- pData(datset)
  expr <- exprs(datset)
  feature <- fData(datset)
  # design
  Design <- model.matrix(~SubGroup, data = pheno)
  # Remove batch effects
  phen <- pheno %>% arrange(Omics)
  prof <- data.frame(expr) %>% dplyr::select(rownames(phen))
  if(!any(rownames(phen) == colnames(prof))){
    stop("The Order of SampleID in DiscoverySet is wrong")
  }
  # CA19-9 should be removed batch effects
  prof_CA199 <- prof[rownames(prof) == "CA199", , F]
  prof_NoCA199 <- prof[rownames(prof) != "CA199", , F]
  prof_rbe <- removeBatchEffect(as.matrix(prof_NoCA199), batch = phen$Omics, design = Design)
  prof_merge <- rbind(prof_rbe, prof_CA199)
  fdf <- new("AnnotatedDataFrame", data=feature)
  exprs_dis <- as.matrix(prof_merge)
  adf_dis <-  new("AnnotatedDataFrame", data=phen)
  experimentData <- new("MIAME",
          name="Hua", lab="Lab",
          title="tumor Experiment",
          abstract="The Mass Spectrometry ExpressionSet with overlap samples",
          url="www.zouhua.top",
          other=list(notes="The Intersect Proteins and Samples"))
  res <- new("ExpressionSet", 
                exprs=exprs_dis,
                phenoData=adf_dis,
                featureData=fdf,
                experimentData=experimentData)
  return(res)  
}
# Batch mean centering
RemoveBatchExprSet_BMC <- function(datset=ExprSet){
  # datset=ExprSet
  pheno <- pData(datset)
  expr <- exprs(datset)
  feature <- fData(datset)
  # Remove batch effects
  phen <- pheno %>% arrange(Omics)
  prof <- data.frame(expr) %>% dplyr::select(rownames(phen))
  # CA19-9 should be removed batch effects
  prof_CA199 <- prof[rownames(prof) == "CA199", , F]
  prof_NoCA199 <- prof[rownames(prof) != "CA199", , F]
  prof_list <- list()
  batchlevels <- unique(phen$Omics)
  for(i in 1:length(batchlevels)){
    prof_list[[i]] <- scale(t(prof_NoCA199 %>% 
                                dplyr::select(rownames(phen[phen$Omics == batchlevels[i], ]))),
                            center = TRUE, scale = FALSE)
  }
  prof_merge <- rbind(t(do.call(rbind, prof_list)), prof_CA199)
  if(!any(rownames(phen) == colnames(prof_merge))){
    stop("The Order of SampleID in DiscoverySet is wrong")
  }  
  fdf <- new("AnnotatedDataFrame", data=feature)
  exprs_dis <- as.matrix(prof_merge)
  adf_dis <-  new("AnnotatedDataFrame", data=phen)
  experimentData <- new("MIAME",
          name="Hua", lab="Lab",
          title="tumor Experiment",
          abstract="The Mass Spectrometry ExpressionSet with overlap samples",
          url="www.zouhua.top",
          other=list(notes="The Intersect Proteins and Samples"))
  res <- new("ExpressionSet", 
             exprs=exprs_dis,
             phenoData=adf_dis,
             featureData=fdf,
             experimentData=experimentData)
  return(res)  
}
# ComBat
RemoveBatchExprSet_ComBat <- function(datset=ExprSet){
  # datset=ExprSet
  pheno <- pData(datset)
  expr <- exprs(datset)
  feature <- fData(datset)
  # design
  Design <- model.matrix(~SubGroup, data = pheno)  
  # Remove batch effects
  phen <- pheno %>% arrange(Omics)
  prof <- data.frame(expr) %>% dplyr::select(rownames(phen))
  # CA19-9 should be removed batch effects
  prof_CA199 <- prof[rownames(prof) == "CA199", , F]
  prof_NoCA199 <- prof[rownames(prof) != "CA199", , F]
  prof_combat <- sva::ComBat(prof_NoCA199, batch = phen$Omics, mod = Design, par.prior = F, prior.plots = F)
  prof_merge <- rbind(prof_combat, prof_CA199)
  if(!any(rownames(phen) == colnames(prof_merge))){
    stop("The Order of SampleID in DiscoverySet is wrong")
  }  
  fdf <- new("AnnotatedDataFrame", data=feature)
  exprs_dis <- as.matrix(prof_merge)
  adf_dis <-  new("AnnotatedDataFrame", data=phen)
  experimentData <- new("MIAME",
          name="Hua", lab="Lab",
          title="tumor Experiment",
          abstract="The Mass Spectrometry ExpressionSet with overlap samples",
          url="www.zouhua.top",
          other=list(notes="The Intersect Proteins and Samples"))
  res <- new("ExpressionSet", 
             exprs=exprs_dis,
             phenoData=adf_dis,
             featureData=fdf,
             experimentData=experimentData)
  return(res)  
}

Run

ExprSet_RBE <- RemoveBatchExprSet(datset=ExprSet)
ExprSet_RBE_Design <- RemoveBatchExprSet_Design(datset=ExprSet)
ExprSet_RBE_BMC <- RemoveBatchExprSet_BMC(datset=ExprSet)
ExprSet_RBE_ComBat <- RemoveBatchExprSet_ComBat(datset=ExprSet)
cowplot::plot_grid(#PCAFun(dataset=ExprSet), 
                   PCAFun(dataset=ExprSet_RBE),
                   PCAFun(dataset=ExprSet_RBE_Design),
                   PCAFun(dataset=ExprSet_RBE_BMC),
                   PCAFun(dataset=ExprSet_RBE_ComBat),
                   align = "hv", 
                   ncol = 2,
                   labels = c(#"Origin", 
                              "Batch corrected", 
                              "Batch corrected with Design", 
                              "Batch corrected with BMC",
                              "Batch corrected with ComBat"))

systemic information

sessionInfo()

R version 4.0.2 (2020-06-22)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: CentOS Linux 8 (Core)
Matrix products: default
BLAS/LAPACK: /disk/share/anaconda3/lib/libopenblasp-r0.3.10.so
locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods   base     
other attached packages:
 [1] DEP_1.12.0          gg.gap_1.4          ggpubr_0.4.0        gtsummary_1.4.0     vegan_2.5-6        
 [6] lattice_0.20-45     permute_0.9-5       factoextra_1.0.7    ggrepel_0.9.1.9999  sampling_2.8       
[11] data.table_1.14.2   convert_1.64.0      marray_1.68.0       limma_3.46.0        Biobase_2.50.0     
[16] BiocGenerics_0.36.0 ggplot2_3.3.5       tibble_3.1.6        dplyr_1.0.7        
loaded via a namespace (and not attached):
  [1] utf8_1.2.2                  shinydashboard_0.7.1        reticulate_1.18            
  [4] gmm_1.6-5                   tidyselect_1.1.1            htmlwidgets_1.5.4          
  [7] grid_4.0.2                  BiocParallel_1.24.1         lpSolve_5.6.15             
 [10] norm_1.0-9.5                munsell_0.5.0               codetools_0.2-18           
 [13] preprocessCore_1.52.1       DT_0.20                     umap_0.2.7.0               
 [16] withr_2.4.3                 colorspace_2.0-2            knitr_1.37                 
 [19] rstudioapi_0.13             stats4_4.0.2                robustbase_0.93-6          
 [22] bayesm_3.1-4                ggsignif_0.6.0              mzID_1.28.0                
 [25] MatrixGenerics_1.2.1        labeling_0.4.2              GenomeInfoDbData_1.2.4     
 [28] farver_2.1.0                vctrs_0.3.8                 generics_0.1.1             
 [31] xfun_0.29                   R6_2.5.1                    doParallel_1.0.16          
 [34] GenomeInfoDb_1.26.4         clue_0.3-60                 bitops_1.0-7               
 [37] DelayedArray_0.16.3         assertthat_0.2.1            promises_1.2.0.1           
 [40] scales_1.1.1                gtable_0.3.0                Cairo_1.5-12.2             
 [43] affy_1.68.0                 sandwich_3.0-0              rlang_0.4.12               
 [46] mzR_2.24.1                  GlobalOptions_0.1.2         splines_4.0.2              
 [49] rstatix_0.7.0               impute_1.64.0               broom_0.7.9                
 [52] BiocManager_1.30.16         yaml_2.2.1                  abind_1.4-5                
 [55] crosstalk_1.2.0             backports_1.4.1             httpuv_1.6.4               
 [58] tensorA_0.36.2              tools_4.0.2                 affyio_1.60.0              
 [61] ellipsis_0.3.2              jquerylib_0.1.4             RColorBrewer_1.1-2         
 [64] MSnbase_2.16.1              Rcpp_1.0.7                  plyr_1.8.6                 
 [67] zlibbioc_1.36.0             purrr_0.3.4                 RCurl_1.98-1.3             
 [70] openssl_1.4.6               GetoptLong_1.0.5            cowplot_1.1.0              
 [73] S4Vectors_0.28.1            zoo_1.8-8                   SummarizedExperiment_1.20.0
 [76] haven_2.3.1                 cluster_2.1.0               tinytex_0.36               
 [79] magrittr_2.0.1              RSpectra_0.16-0             openxlsx_4.2.3             
 [82] circlize_0.4.10             pcaMethods_1.80.0           mvtnorm_1.1-1              
 [85] ProtGenerics_1.22.0         matrixStats_0.61.0          xtable_1.8-4               
 [88] hms_1.1.1                   mime_0.12                   evaluate_0.14              
 [91] XML_3.99-0.6                rio_0.5.16                  readxl_1.3.1               
 [94] IRanges_2.24.1              shape_1.4.5                 compiler_4.0.2             
 [97] gt_0.2.2                    ncdf4_1.17                  crayon_1.4.2               
[100] htmltools_0.5.2             mgcv_1.8-38                 later_1.2.0                
[103] tzdb_0.2.0                  tidyr_1.1.4                 DBI_1.1.2                  
[106] ComplexHeatmap_2.6.2        MASS_7.3-54                 tmvtnorm_1.4-10            
[109] broom.helpers_1.3.0         compositions_2.0-2          Matrix_1.4-0               
[112] car_3.0-10                  readr_2.1.1                 cli_3.1.0                  
[115] vsn_3.58.0                  imputeLCMD_2.0              GenomicRanges_1.42.0       
[118] forcats_0.5.0               pkgconfig_2.0.3             foreign_0.8-81             
[121] MALDIquant_1.19.3           foreach_1.5.1               bslib_0.3.1                
[124] XVector_0.30.0              stringr_1.4.0               digest_0.6.29              
[127] rmarkdown_2.11              cellranger_1.1.0            curl_4.3.2                 
[130] shiny_1.7.1                 rjson_0.2.20                lifecycle_1.0.1            
[133] nlme_3.1-150                jsonlite_1.7.2              carData_3.0-4              
[136] askpass_1.1                 fansi_0.5.0                 pillar_1.6.4               
[139] fastmap_1.1.0               DEoptimR_1.0-8              survival_3.2-13            
[142] glue_1.6.0                  zip_2.1.1                   png_0.1-7                  
[145] iterators_1.0.13            stringi_1.4.6               sass_0.4.0

Reference

参考文章如引起任何侵权问题，可以与我联系，谢谢。