前言

批次校正是数据预处理常见的步骤,在整合多批次数据的时候,需要考虑到这些非生物学因素对treatment的影响。更多知识分享请到 https://zouhua.top/

加载R包

  1. library(dplyr)
  2. library(tibble)
  3. library(Biobase)
  4. library(limma)
  5. library(ggplot2)
  6. # rm(list = ls())
  7. options(stringsAsFactors = F)
  8. options(future.globals.maxSize = 1000 * 1024^2)
  9. subgrp <- c("HC", "CP", "PDAC")
  10. grp.col <- c("#568875", "#73FAFC", "#EE853D")
  11. ExprSet <- readRDS("MS_Protein_ImputeExprSet.RDS")

Function

  1. PCAFun <- function(dataset=ExprSet){
  2. # dataset=ExprSet
  3. metadata <- pData(dataset)
  4. profile <- exprs(dataset)
  5. # pca
  6. pca <- prcomp(scale(t(profile), center = T, scale = T))
  7. require(factoextra)
  8. eig <- get_eig(pca)
  9. # explains variable
  10. explains <- paste0(paste0("PC", seq(2)), "(", paste0(round(eig[1:2, 2], 2), "%"), ")")
  11. # principal component score of each sample
  12. score <- inner_join(pca$x %>% data.frame() %>%
  13. dplyr::select(c(1:2)) %>%
  14. rownames_to_column("SampleID"),
  15. metadata %>% rownames_to_column("SampleID"),
  16. by = "SampleID") %>%
  17. mutate(SubGroup=factor(SubGroup, levels = subgrp))
  18. # PERMANOVA
  19. require(vegan)
  20. set.seed(123)
  21. if(any(profile < 0)){
  22. res_adonis <- adonis(vegdist(t(profile), method = "manhattan") ~ metadata$SubGroup, permutations = 999)
  23. }else{
  24. res_adonis <- adonis(vegdist(t(profile), method = "bray") ~ metadata$SubGroup, permutations = 999)
  25. }
  26. adn_pvalue <- res_adonis[[1]][["Pr(>F)"]][1]
  27. adn_rsquared <- round(res_adonis[[1]][["R2"]][1],3)
  28. signi_label <- paste(cut(adn_pvalue,breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ".")))
  29. adn_res_format <- bquote(atop(atop("PERMANOVA",R^2==~.(adn_rsquared)),
  30. atop("p-value="~.(adn_pvalue)~.(signi_label), phantom())))
  31. pl <- ggplot(score, aes(x=PC1, y=PC2))+
  32. #geom_point(aes(fill=SubGroup), size=3.5, shape=21, stroke = .8, color = "black")+
  33. geom_point(aes(color=SubGroup, shape=Omics), size=2.5)+
  34. stat_ellipse(aes(color=SubGroup), level = 0.95, linetype = 1, size = 1.5)+
  35. labs(x=explains[1], y=explains[2])+
  36. scale_color_manual(values = grp.col)+
  37. # scale_fill_manual(name = "Condition",
  38. # values = grp.col)+
  39. scale_shape_manual(name = "Batch",
  40. values = c(15, 16, 17))+
  41. annotate("text", x = max(score$PC1) - 8,
  42. y = min(score$PC1),
  43. label = adn_res_format,
  44. size = 6)+
  45. #guides(color="none")+
  46. theme_bw()+
  47. theme(axis.title = element_text(size = 10, color = "black", face = "bold"),
  48. axis.text = element_text(size = 9, color = "black"),
  49. text = element_text(size = 8, color = "black", family = "serif"),
  50. strip.text = element_text(size = 9, color = "black", face = "bold"),
  51. panel.grid = element_blank(),
  52. legend.title = element_text(size = 11, color = "black", family = "serif"),
  53. legend.text = element_text(size = 10, color = "black", family = "serif"),
  54. legend.position = c(0, 0),
  55. legend.justification = c(0, 0),
  56. legend.background = element_rect(color = "black", fill = "white", linetype = 2, size = 0.5))
  57. return(pl)
  58. }
  59. RemoveBatchExprSet <- function(datset=ExprSet){
  60. # datset=ExprSet
  61. pheno <- pData(datset)
  62. expr <- exprs(datset)
  63. feature <- fData(datset)
  64. # Remove batch effects
  65. phen <- pheno %>% arrange(Omics)
  66. prof <- data.frame(expr) %>% dplyr::select(rownames(phen))
  67. if(!any(rownames(phen) == colnames(prof))){
  68. stop("The Order of SampleID in DiscoverySet is wrong")
  69. }
  70. # CA19-9 should be removed batch effects
  71. prof_CA199 <- prof[rownames(prof) == "CA199", , F]
  72. prof_NoCA199 <- prof[rownames(prof) != "CA199", , F]
  73. prof_rbe <- removeBatchEffect(as.matrix(prof_NoCA199), batch = phen$Omics)
  74. prof_merge <- rbind(prof_rbe, prof_CA199)
  75. fdf <- new("AnnotatedDataFrame", data=feature)
  76. exprs_dis <- as.matrix(prof_merge)
  77. adf_dis <- new("AnnotatedDataFrame", data=phen)
  78. experimentData <- new("MIAME",
  79. name="Hua", lab="Lab",
  80. title="tumor Experiment",
  81. abstract="The Mass Spectrometry ExpressionSet with overlap samples",
  82. url="www.zouhua.top",
  83. other=list(notes="The Intersect Proteins and Samples"))
  84. res <- new("ExpressionSet",
  85. exprs=exprs_dis,
  86. phenoData=adf_dis,
  87. featureData=fdf,
  88. experimentData=experimentData)
  89. return(res)
  90. }
  91. RemoveBatchExprSet_Design <- function(datset=ExprSet){
  92. # datset=ExprSet
  93. pheno <- pData(datset)
  94. expr <- exprs(datset)
  95. feature <- fData(datset)
  96. # design
  97. Design <- model.matrix(~SubGroup, data = pheno)
  98. # Remove batch effects
  99. phen <- pheno %>% arrange(Omics)
  100. prof <- data.frame(expr) %>% dplyr::select(rownames(phen))
  101. if(!any(rownames(phen) == colnames(prof))){
  102. stop("The Order of SampleID in DiscoverySet is wrong")
  103. }
  104. # CA19-9 should be removed batch effects
  105. prof_CA199 <- prof[rownames(prof) == "CA199", , F]
  106. prof_NoCA199 <- prof[rownames(prof) != "CA199", , F]
  107. prof_rbe <- removeBatchEffect(as.matrix(prof_NoCA199), batch = phen$Omics, design = Design)
  108. prof_merge <- rbind(prof_rbe, prof_CA199)
  109. fdf <- new("AnnotatedDataFrame", data=feature)
  110. exprs_dis <- as.matrix(prof_merge)
  111. adf_dis <- new("AnnotatedDataFrame", data=phen)
  112. experimentData <- new("MIAME",
  113. name="Hua", lab="Lab",
  114. title="tumor Experiment",
  115. abstract="The Mass Spectrometry ExpressionSet with overlap samples",
  116. url="www.zouhua.top",
  117. other=list(notes="The Intersect Proteins and Samples"))
  118. res <- new("ExpressionSet",
  119. exprs=exprs_dis,
  120. phenoData=adf_dis,
  121. featureData=fdf,
  122. experimentData=experimentData)
  123. return(res)
  124. }
  125. # Batch mean centering
  126. RemoveBatchExprSet_BMC <- function(datset=ExprSet){
  127. # datset=ExprSet
  128. pheno <- pData(datset)
  129. expr <- exprs(datset)
  130. feature <- fData(datset)
  131. # Remove batch effects
  132. phen <- pheno %>% arrange(Omics)
  133. prof <- data.frame(expr) %>% dplyr::select(rownames(phen))
  134. # CA19-9 should be removed batch effects
  135. prof_CA199 <- prof[rownames(prof) == "CA199", , F]
  136. prof_NoCA199 <- prof[rownames(prof) != "CA199", , F]
  137. prof_list <- list()
  138. batchlevels <- unique(phen$Omics)
  139. for(i in 1:length(batchlevels)){
  140. prof_list[[i]] <- scale(t(prof_NoCA199 %>%
  141. dplyr::select(rownames(phen[phen$Omics == batchlevels[i], ]))),
  142. center = TRUE, scale = FALSE)
  143. }
  144. prof_merge <- rbind(t(do.call(rbind, prof_list)), prof_CA199)
  145. if(!any(rownames(phen) == colnames(prof_merge))){
  146. stop("The Order of SampleID in DiscoverySet is wrong")
  147. }
  148. fdf <- new("AnnotatedDataFrame", data=feature)
  149. exprs_dis <- as.matrix(prof_merge)
  150. adf_dis <- new("AnnotatedDataFrame", data=phen)
  151. experimentData <- new("MIAME",
  152. name="Hua", lab="Lab",
  153. title="tumor Experiment",
  154. abstract="The Mass Spectrometry ExpressionSet with overlap samples",
  155. url="www.zouhua.top",
  156. other=list(notes="The Intersect Proteins and Samples"))
  157. res <- new("ExpressionSet",
  158. exprs=exprs_dis,
  159. phenoData=adf_dis,
  160. featureData=fdf,
  161. experimentData=experimentData)
  162. return(res)
  163. }
  164. # ComBat
  165. RemoveBatchExprSet_ComBat <- function(datset=ExprSet){
  166. # datset=ExprSet
  167. pheno <- pData(datset)
  168. expr <- exprs(datset)
  169. feature <- fData(datset)
  170. # design
  171. Design <- model.matrix(~SubGroup, data = pheno)
  172. # Remove batch effects
  173. phen <- pheno %>% arrange(Omics)
  174. prof <- data.frame(expr) %>% dplyr::select(rownames(phen))
  175. # CA19-9 should be removed batch effects
  176. prof_CA199 <- prof[rownames(prof) == "CA199", , F]
  177. prof_NoCA199 <- prof[rownames(prof) != "CA199", , F]
  178. prof_combat <- sva::ComBat(prof_NoCA199, batch = phen$Omics, mod = Design, par.prior = F, prior.plots = F)
  179. prof_merge <- rbind(prof_combat, prof_CA199)
  180. if(!any(rownames(phen) == colnames(prof_merge))){
  181. stop("The Order of SampleID in DiscoverySet is wrong")
  182. }
  183. fdf <- new("AnnotatedDataFrame", data=feature)
  184. exprs_dis <- as.matrix(prof_merge)
  185. adf_dis <- new("AnnotatedDataFrame", data=phen)
  186. experimentData <- new("MIAME",
  187. name="Hua", lab="Lab",
  188. title="tumor Experiment",
  189. abstract="The Mass Spectrometry ExpressionSet with overlap samples",
  190. url="www.zouhua.top",
  191. other=list(notes="The Intersect Proteins and Samples"))
  192. res <- new("ExpressionSet",
  193. exprs=exprs_dis,
  194. phenoData=adf_dis,
  195. featureData=fdf,
  196. experimentData=experimentData)
  197. return(res)
  198. }

Run

  1. ExprSet_RBE <- RemoveBatchExprSet(datset=ExprSet)
  2. ExprSet_RBE_Design <- RemoveBatchExprSet_Design(datset=ExprSet)
  3. ExprSet_RBE_BMC <- RemoveBatchExprSet_BMC(datset=ExprSet)
  4. ExprSet_RBE_ComBat <- RemoveBatchExprSet_ComBat(datset=ExprSet)
  5. cowplot::plot_grid(#PCAFun(dataset=ExprSet),
  6. PCAFun(dataset=ExprSet_RBE),
  7. PCAFun(dataset=ExprSet_RBE_Design),
  8. PCAFun(dataset=ExprSet_RBE_BMC),
  9. PCAFun(dataset=ExprSet_RBE_ComBat),
  10. align = "hv",
  11. ncol = 2,
  12. labels = c(#"Origin",
  13. "Batch corrected",
  14. "Batch corrected with Design",
  15. "Batch corrected with BMC",
  16. "Batch corrected with ComBat"))

systemic information

  1. sessionInfo()
  1. R version 4.0.2 (2020-06-22)
  2. Platform: x86_64-conda_cos6-linux-gnu (64-bit)
  3. Running under: CentOS Linux 8 (Core)
  4. Matrix products: default
  5. BLAS/LAPACK: /disk/share/anaconda3/lib/libopenblasp-r0.3.10.so
  6. locale:
  7. [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
  8. [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
  9. [9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
  10. attached base packages:
  11. [1] parallel stats graphics grDevices utils datasets methods base
  12. other attached packages:
  13. [1] DEP_1.12.0 gg.gap_1.4 ggpubr_0.4.0 gtsummary_1.4.0 vegan_2.5-6
  14. [6] lattice_0.20-45 permute_0.9-5 factoextra_1.0.7 ggrepel_0.9.1.9999 sampling_2.8
  15. [11] data.table_1.14.2 convert_1.64.0 marray_1.68.0 limma_3.46.0 Biobase_2.50.0
  16. [16] BiocGenerics_0.36.0 ggplot2_3.3.5 tibble_3.1.6 dplyr_1.0.7
  17. loaded via a namespace (and not attached):
  18. [1] utf8_1.2.2 shinydashboard_0.7.1 reticulate_1.18
  19. [4] gmm_1.6-5 tidyselect_1.1.1 htmlwidgets_1.5.4
  20. [7] grid_4.0.2 BiocParallel_1.24.1 lpSolve_5.6.15
  21. [10] norm_1.0-9.5 munsell_0.5.0 codetools_0.2-18
  22. [13] preprocessCore_1.52.1 DT_0.20 umap_0.2.7.0
  23. [16] withr_2.4.3 colorspace_2.0-2 knitr_1.37
  24. [19] rstudioapi_0.13 stats4_4.0.2 robustbase_0.93-6
  25. [22] bayesm_3.1-4 ggsignif_0.6.0 mzID_1.28.0
  26. [25] MatrixGenerics_1.2.1 labeling_0.4.2 GenomeInfoDbData_1.2.4
  27. [28] farver_2.1.0 vctrs_0.3.8 generics_0.1.1
  28. [31] xfun_0.29 R6_2.5.1 doParallel_1.0.16
  29. [34] GenomeInfoDb_1.26.4 clue_0.3-60 bitops_1.0-7
  30. [37] DelayedArray_0.16.3 assertthat_0.2.1 promises_1.2.0.1
  31. [40] scales_1.1.1 gtable_0.3.0 Cairo_1.5-12.2
  32. [43] affy_1.68.0 sandwich_3.0-0 rlang_0.4.12
  33. [46] mzR_2.24.1 GlobalOptions_0.1.2 splines_4.0.2
  34. [49] rstatix_0.7.0 impute_1.64.0 broom_0.7.9
  35. [52] BiocManager_1.30.16 yaml_2.2.1 abind_1.4-5
  36. [55] crosstalk_1.2.0 backports_1.4.1 httpuv_1.6.4
  37. [58] tensorA_0.36.2 tools_4.0.2 affyio_1.60.0
  38. [61] ellipsis_0.3.2 jquerylib_0.1.4 RColorBrewer_1.1-2
  39. [64] MSnbase_2.16.1 Rcpp_1.0.7 plyr_1.8.6
  40. [67] zlibbioc_1.36.0 purrr_0.3.4 RCurl_1.98-1.3
  41. [70] openssl_1.4.6 GetoptLong_1.0.5 cowplot_1.1.0
  42. [73] S4Vectors_0.28.1 zoo_1.8-8 SummarizedExperiment_1.20.0
  43. [76] haven_2.3.1 cluster_2.1.0 tinytex_0.36
  44. [79] magrittr_2.0.1 RSpectra_0.16-0 openxlsx_4.2.3
  45. [82] circlize_0.4.10 pcaMethods_1.80.0 mvtnorm_1.1-1
  46. [85] ProtGenerics_1.22.0 matrixStats_0.61.0 xtable_1.8-4
  47. [88] hms_1.1.1 mime_0.12 evaluate_0.14
  48. [91] XML_3.99-0.6 rio_0.5.16 readxl_1.3.1
  49. [94] IRanges_2.24.1 shape_1.4.5 compiler_4.0.2
  50. [97] gt_0.2.2 ncdf4_1.17 crayon_1.4.2
  51. [100] htmltools_0.5.2 mgcv_1.8-38 later_1.2.0
  52. [103] tzdb_0.2.0 tidyr_1.1.4 DBI_1.1.2
  53. [106] ComplexHeatmap_2.6.2 MASS_7.3-54 tmvtnorm_1.4-10
  54. [109] broom.helpers_1.3.0 compositions_2.0-2 Matrix_1.4-0
  55. [112] car_3.0-10 readr_2.1.1 cli_3.1.0
  56. [115] vsn_3.58.0 imputeLCMD_2.0 GenomicRanges_1.42.0
  57. [118] forcats_0.5.0 pkgconfig_2.0.3 foreign_0.8-81
  58. [121] MALDIquant_1.19.3 foreach_1.5.1 bslib_0.3.1
  59. [124] XVector_0.30.0 stringr_1.4.0 digest_0.6.29
  60. [127] rmarkdown_2.11 cellranger_1.1.0 curl_4.3.2
  61. [130] shiny_1.7.1 rjson_0.2.20 lifecycle_1.0.1
  62. [133] nlme_3.1-150 jsonlite_1.7.2 carData_3.0-4
  63. [136] askpass_1.1 fansi_0.5.0 pillar_1.6.4
  64. [139] fastmap_1.1.0 DEoptimR_1.0-8 survival_3.2-13
  65. [142] glue_1.6.0 zip_2.1.1 png_0.1-7
  66. [145] iterators_1.0.13 stringi_1.4.6 sass_0.4.0

Reference

参考文章如引起任何侵权问题,可以与我联系,谢谢。