前言

随着大数据时代的到来,我们常常需要对高纬度数据进行降维处理后再对数据整体结构进行可视化,进而发现数据内在的区别,为后续数据分析提供初步的参考。更多知识分享请到 https://zouhua.top/

这里主要介绍三类数据分析方法的R实现过程:

1.PCA主成分分析;

2.tSNE,基于概率密度分布计算能保留更多组间和个体差异的方法;

3.UMAP,非线性降维方法。

导入数据

  1. options(warn = 0)
  2. library(dplyr)
  3. library(tibble)
  4. library(ggplot2)
  5. options(warn = 0)
  6. ExprSet_LOG2Impute <- readRDS("Mass_Proteins_filtered_Normal_LOG2Impute.RDS")
  7. subgrp <- c("NC", "PC", "PT")
  8. grp.col <- c("#568875", "#73FAFC", "#EE853D")

Principal Component Analysis

  1. PCAFun <- function(dataset = ExprSet_LOG2Impute ){
  2. # dataset = ExprSet_LOG2Impute
  3. require(convert)
  4. metadata <- pData(dataset)
  5. profile <- exprs(dataset)
  6. # pca
  7. pca <- prcomp(scale(t(profile), center = T, scale = T))
  8. require(factoextra)
  9. eig <- get_eig(pca)
  10. # explains variable
  11. explains <- paste0(paste0("PC", seq(2)), "(", paste0(round(eig[1:2, 2], 2), "%"), ")")
  12. # principal component score of each sample
  13. score <- inner_join(pca$x %>% data.frame() %>%
  14. dplyr::select(c(1:2)) %>%
  15. rownames_to_column("SampleID"),
  16. metadata %>% rownames_to_column("SampleID"),
  17. by = "SampleID") %>%
  18. mutate(Group=factor(Group, levels = grp))
  19. # PERMANOVA
  20. require(vegan)
  21. set.seed(123)
  22. if(any(profile < 0)){
  23. res_adonis <- adonis(vegdist(t(profile), method = "manhattan") ~ metadata$SubGroup, permutations = 999)
  24. }else{
  25. res_adonis <- adonis(vegdist(t(profile), method = "bray") ~ metadata$SubGroup, permutations = 999)
  26. }
  27. adn_pvalue <- res_adonis[[1]][["Pr(>F)"]][1]
  28. adn_rsquared <- round(res_adonis[[1]][["R2"]][1],3)
  29. #use the bquote function to format adonis results to be annotated on the ordination plot.
  30. signi_label <- paste(cut(adn_pvalue,breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ".")))
  31. adn_res_format <- bquote(atop(atop("PERMANOVA",R^2==~.(adn_rsquared)),
  32. atop("p-value="~.(adn_pvalue)~.(signi_label), phantom())))
  33. pl <- ggplot(score, aes(x=PC1, y=PC2))+
  34. geom_point(aes(fill=SubGroup), size=3.5, shape=21, stroke = .8, color = "black")+
  35. stat_ellipse(aes(color=SubGroup), level = 0.95, linetype = 1, size = 1.5)+
  36. labs(x=explains[1], y=explains[2])+
  37. scale_color_manual(values = grp.col)+
  38. scale_fill_manual(name = "Condition",
  39. values = grp.col)+
  40. annotate("text", x = max(score$PC1) - 8,
  41. y = min(score$PC1),
  42. label = adn_res_format,
  43. size = 6)+
  44. guides(color=F)+
  45. theme_classic()+
  46. theme(axis.title = element_text(size = 10, color = "black", face = "bold"),
  47. axis.text = element_text(size = 9, color = "black"),
  48. text = element_text(size = 8, color = "black", family = "serif"),
  49. strip.text = element_text(size = 9, color = "black", face = "bold"),
  50. panel.grid = element_blank(),
  51. legend.title = element_text(size = 11, color = "black", family = "serif"),
  52. legend.text = element_text(size = 10, color = "black", family = "serif"),
  53. legend.position = c(0, 0),
  54. legend.justification = c(0, 0),
  55. legend.background = element_rect(color = "black", fill = "white", linetype = 2, size = 0.5))
  56. return(pl)
  57. }
  58. PCA_LOG2Impute <- PCAFun(dataset = ExprSet_LOG2Impute)
  59. PCA_LOG2Impute
  60. ggsave("Mass_ProteinsLOG2Impute_PCA.pdf", PCA_LOG2Impute, width = 5, height = 5, dpi = 300)

image.png

Rtsne

RtsneFun <- function(dataset = ExprSet_LOG2Impute,
                     perpl = 10){

  # dataset = ExprSet_LOG2Impute
  # perpl = 10

  require(convert)
  metadata <- pData(dataset)
  profile <- t(exprs(dataset))

  # Rtsne
  require(Rtsne)
  #set.seed(123)
  Rtsne <- Rtsne(profile, 
                 dims=2, 
                 perplexity=perpl,
                 verbose=TRUE, 
                 max_iter=500, 
                 eta=200)
  point <- Rtsne$Y %>% data.frame() %>% 
    dplyr::select(c(1:2)) %>%
    setNames(c("tSNE1", "tSNE2"))
  rownames(point) <- rownames(profile)
  score <- inner_join(point %>% rownames_to_column("SampleID"), 
                      metadata %>% rownames_to_column("SampleID"),
                      by = "SampleID") %>%
    mutate(Group=factor(Group, levels = grp))

  # PERMANOVA
  require(vegan)
  set.seed(123)
  if(any(profile < 0)){
    res_adonis <- adonis(vegdist(profile, method = "manhattan") ~ metadata$SubGroup, permutations = 999) 
  }else{
    res_adonis <- adonis(vegdist(profile, method = "bray") ~ metadata$SubGroup, permutations = 999)    
  }
  adn_pvalue <- res_adonis[[1]][["Pr(>F)"]][1]
  adn_rsquared <- round(res_adonis[[1]][["R2"]][1],3)
  #use the bquote function to format adonis results to be annotated on the ordination plot.
  signi_label <- paste(cut(adn_pvalue,breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ".")))
  adn_res_format <- bquote(atop(atop("PERMANOVA",R^2==~.(adn_rsquared)),
                                  atop("p-value="~.(adn_pvalue)~.(signi_label), phantom()))) 

  pl <- ggplot(score, aes(x=tSNE1, y=tSNE2))+
              geom_point(aes(fill=SubGroup), size=3.5, shape=21, stroke = .8, color = "black")+
              stat_ellipse(aes(color=SubGroup), level = 0.95, linetype = 1, size = 1.5)+
              scale_color_manual(values = grp.col)+
              scale_fill_manual(name = "Condition", 
                                values = grp.col)+
              annotate("text", x = max(score$tSNE1) - 8,
                       y = max(score$tSNE2)-5,
                       label = adn_res_format,
                       size = 6)+ 
              guides(color=F)+
              theme_classic()+
              theme(axis.title = element_text(size = 10, color = "black", face = "bold"),
                    axis.text = element_text(size = 9, color = "black"),
                    text = element_text(size = 8, color = "black", family = "serif"),
                    strip.text = element_text(size = 9, color = "black", face = "bold"), 
                    panel.grid = element_blank(),
                    legend.title = element_text(size = 11, color = "black", family = "serif"),
                    legend.text = element_text(size = 10, color = "black", family = "serif"),
                    legend.position = c(0, 0),
                    legend.justification = c(0, 0),
                    legend.background = element_rect(color = "black", fill = "white", linetype = 2, size = 0.5))
  return(pl)
}

Rtsne_LOG2Impute <- RtsneFun(dataset = ExprSet_LOG2Impute, perpl = 10)
Rtsne_LOG2Impute
ggsave("Mass_ProteinsLOG2Impute_Rtsne.pdf", Rtsne_LOG2Impute, width = 5, height = 5, dpi = 300)

image.png

UMAP: a non-linear dimensionality reduction algorithm

UMAPFun <- function(dataset = ExprSet_LOG2Impute){

  # dataset = ExprSet_LOG2Impute

  require(convert)
  metadata <- pData(dataset)
  profile <- t(exprs(dataset))

  # umap 
  require(umap)
  umap <- umap::umap(profile)

  point <- umap$layout %>% data.frame() %>%
    setNames(c("UMAP1", "UMAP2"))
  rownames(point) <- rownames(profile)
  score <- inner_join(point %>% rownames_to_column("SampleID"), 
                      metadata %>% rownames_to_column("SampleID"),
                      by = "SampleID") %>%
    mutate(Group=factor(Group, levels = grp))

  # PERMANOVA
  require(vegan)
  set.seed(123)
  if(any(profile < 0)){
    res_adonis <- adonis(vegdist(profile, method = "manhattan") ~ metadata$SubGroup, permutations = 999) 
  }else{
    res_adonis <- adonis(vegdist(profile, method = "bray") ~ metadata$SubGroup, permutations = 999)    
  }
  adn_pvalue <- res_adonis[[1]][["Pr(>F)"]][1]
  adn_rsquared <- round(res_adonis[[1]][["R2"]][1],3)
  #use the bquote function to format adonis results to be annotated on the ordination plot.
  signi_label <- paste(cut(adn_pvalue,breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ".")))
  adn_res_format <- bquote(atop(atop("PERMANOVA",R^2==~.(adn_rsquared)),
                                  atop("p-value="~.(adn_pvalue)~.(signi_label), phantom())))   

  pl <- ggplot(score, aes(x=UMAP1, y=UMAP2))+
              geom_point(aes(fill=SubGroup), size=3.5, shape=21, stroke = .8, color = "black")+
              stat_ellipse(aes(color=SubGroup), level = 0.95, linetype = 1, size = 1.5)+
              scale_color_manual(values = grp.col)+
              scale_fill_manual(name = "Condition", 
                                values = grp.col)+
              annotate("text", x = max(score$UMAP1),
                       y = min(score$UMAP2),
                       label = adn_res_format,
                       size = 6)+ 
              guides(color=F)+
              theme_classic()+
              theme(axis.title = element_text(size = 10, color = "black", face = "bold"),
                    axis.text = element_text(size = 9, color = "black"),
                    text = element_text(size = 8, color = "black", family = "serif"),
                    strip.text = element_text(size = 9, color = "black", face = "bold"), 
                    panel.grid = element_blank(),
                    legend.title = element_text(size = 11, color = "black", family = "serif"),
                    legend.text = element_text(size = 10, color = "black", family = "serif"),
                    legend.position = c(0, 0),
                    legend.justification = c(0, 0),
                    legend.background = element_rect(color = "black", fill = "white", linetype = 2, size = 0.5))
  return(pl)
}

UMAP_LOG2Impute <- UMAPFun(dataset = ExprSet_LOG2Impute)
UMAP_LOG2Impute 
ggsave("Mass_ProteinsLOG2Impute_UMAP.pdf", UMAP_LOG2Impute, width = 5, height = 5, dpi = 300)

image.png

Notes: 三组数据的整体性差异如果较为明显,一般可以在不同的降维方法上都有所体现。从分组看,NC组和PC、PT组均存在明显的差异,后续数据分析应该着重关注 NC vs PC, NC vs PT,但也可以对PC vs PT做比较分析。

Systemic information

sessionInfo()
R version 4.0.2 (2020-06-22)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: CentOS Linux 8 (Core)

Matrix products: default
BLAS/LAPACK: /disk/share/anaconda3/lib/libopenblasp-r0.3.10.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats4    parallel  stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] umap_0.2.7.0                                        Rtsne_0.15                                         
 [3] vegan_2.5-6                                         lattice_0.20-41                                    
 [5] permute_0.9-5                                       factoextra_1.0.7                                   
 [7] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0 IlluminaHumanMethylationEPICmanifest_0.3.0         
 [9] minfi_1.36.0                                        bumphunter_1.32.0                                  
[11] locfit_1.5-9.4                                      iterators_1.0.13                                   
[13] foreach_1.5.1                                       Biostrings_2.58.0                                  
[15] XVector_0.30.0                                      SummarizedExperiment_1.20.0                        
[17] MatrixGenerics_1.2.1                                matrixStats_0.58.0                                 
[19] GenomicRanges_1.42.0                                GenomeInfoDb_1.26.4                                
[21] IRanges_2.24.1                                      S4Vectors_0.28.1                                   
[23] eulerr_6.1.0                                        UpSetR_1.4.0                                       
[25] ggplot2_3.3.3                                       DEP_1.12.0                                         
[27] convert_1.64.0                                      marray_1.68.0                                      
[29] limma_3.46.0                                        Biobase_2.50.0                                     
[31] BiocGenerics_0.36.0                                 data.table_1.14.0                                  
[33] tibble_3.1.0                                        dplyr_1.0.5                                        

loaded via a namespace (and not attached):
  [1] utf8_1.2.1                shinydashboard_0.7.1      reticulate_1.18           gmm_1.6-5                
  [5] tidyselect_1.1.0          RSQLite_2.2.5             AnnotationDbi_1.52.0      htmlwidgets_1.5.3        
  [9] grid_4.0.2                BiocParallel_1.24.1       norm_1.0-9.5              munsell_0.5.0            
 [13] codetools_0.2-18          preprocessCore_1.52.1     DT_0.18                   withr_2.4.1              
 [17] colorspace_2.0-0          knitr_1.31                rstudioapi_0.13           mzID_1.28.0              
 [21] labeling_0.4.2            GenomeInfoDbData_1.2.4    farver_2.1.0              bit64_4.0.5              
 [25] rhdf5_2.34.0              vctrs_0.3.7               generics_0.1.0            xfun_0.20                
 [29] BiocFileCache_1.14.0      R6_2.5.0                  doParallel_1.0.16         illuminaio_0.32.0        
 [33] clue_0.3-58               bitops_1.0-6              rhdf5filters_1.2.0        cachem_1.0.4             
 [37] reshape_0.8.8             DelayedArray_0.16.3       assertthat_0.2.1          promises_1.2.0.1         
 [41] scales_1.1.1              gtable_0.3.0              Cairo_1.5-12.2            affy_1.68.0              
 [45] sandwich_3.0-0            rlang_0.4.10              genefilter_1.72.0         mzR_2.24.1               
 [49] GlobalOptions_0.1.2       splines_4.0.2             rtracklayer_1.50.0        GEOquery_2.58.0          
 [53] impute_1.64.0             yaml_2.2.1                reshape2_1.4.4            BiocManager_1.30.12      
 [57] GenomicFeatures_1.42.2    httpuv_1.5.5              tools_4.0.2               nor1mix_1.3-0            
 [61] affyio_1.60.0             ellipsis_0.3.1            jquerylib_0.1.3           RColorBrewer_1.1-2       
 [65] siggenes_1.64.0           MSnbase_2.16.1            Rcpp_1.0.6                plyr_1.8.6               
 [69] sparseMatrixStats_1.2.1   progress_1.2.2            zlibbioc_1.36.0           purrr_0.3.4              
 [73] RCurl_1.98-1.3            prettyunits_1.1.1         openssl_1.4.3             GetoptLong_1.0.5         
 [77] cowplot_1.1.0             zoo_1.8-8                 ggrepel_0.9.1.9999        cluster_2.1.0            
 [81] tinytex_0.31              magrittr_2.0.1            RSpectra_0.16-0           circlize_0.4.10          
 [85] pcaMethods_1.80.0         mvtnorm_1.1-1             ProtGenerics_1.22.0       evaluate_0.14            
 [89] hms_1.0.0                 mime_0.10                 xtable_1.8-4              XML_3.99-0.6             
 [93] mclust_5.4.7              gridExtra_2.3             shape_1.4.5               compiler_4.0.2           
 [97] biomaRt_2.46.3            ncdf4_1.17                crayon_1.4.1              htmltools_0.5.1.1        
[101] mgcv_1.8-34               later_1.1.0.1             tidyr_1.1.3               DBI_1.1.1                
[105] dbplyr_2.1.1              ComplexHeatmap_2.6.2      MASS_7.3-53.1             tmvtnorm_1.4-10          
[109] rappdirs_0.3.3            Matrix_1.3-2              readr_1.4.0               cli_2.4.0                
[113] vsn_3.58.0                imputeLCMD_2.0            quadprog_1.5-8            pkgconfig_2.0.3          
[117] GenomicAlignments_1.26.0  MALDIquant_1.19.3         xml2_1.3.2                annotate_1.68.0          
[121] bslib_0.2.4               rngtools_1.5              multtest_2.46.0           beanplot_1.2             
[125] doRNG_1.8.2               scrime_1.3.5              stringr_1.4.0             digest_0.6.27            
[129] rmarkdown_2.7             base64_2.0                DelayedMatrixStats_1.12.3 curl_4.3                 
[133] shiny_1.6.0               Rsamtools_2.6.0           rjson_0.2.20              jsonlite_1.7.2           
[137] nlme_3.1-150              lifecycle_1.0.0           Rhdf5lib_1.12.1           askpass_1.1              
[141] fansi_0.4.2               pillar_1.6.0              fastmap_1.1.0             httr_1.4.2               
[145] survival_3.2-10           glue_1.4.2                png_0.1-7                 bit_4.0.4                
[149] sass_0.3.1                stringi_1.4.6             HDF5Array_1.18.1          blob_1.2.1               
[153] memoise_2.0.0

Reference

  1. How to change Legend of ggplot2
  2. How to change ggplot facet labels