数据分析：蛋白质质谱数据的差异检验

前言
导入数据
Differential Expression Analysis
Welch’s t-test
systemic information
Reference

前言

上一节，我们通过对组间整体质谱数据分析后，发现NC和PC、PT的组间差异显著，那么具体到单个蛋白质水平的结果又是如何呢。更多知识分享请到 https://zouhua.top/。

在单个蛋白质水平上做比较分析，我们可以使用limma包的函数进行分析。除了组间比较外，它还可以对某些可能潜在影响比较结果的因素如性别、年龄等进行校正处理(可以先期对组间临床表型进行组间差异比较，查看哪些指标是组间显著差异的，这些指标可能影响蛋白质的组间差异比较。它们应该作为协变量被校正掉).

除了使用基于线性模型的limma包外，还可以使用常用的t-test等，但默认情况下，我们会认为case和control组的数据都服从同一正态分布，具有相同的方差，这个时候我们会使用student-t test。可实际上，case和control组的分布可能不是同一个方差，我们这个时候应该选择Welch’s t-test。

导入数据

library(dplyr)
library(tibble)
library(ggplot2)
library(convert)
library(limma)
# rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 1000 * 1024^2)
subgrp <- c("NC", "PC", "PT")
grp.col <- c("#568875", "#73FAFC", "#EE853D")
ExprSet_LOG2Impute <- readRDS("Mass_Proteins_filtered_Normal_LOG2Impute.RDS")

Differential Expression Analysis

如果数据存在配对情况，则可设置pair=T。
scale参数是针对counts数据设置的。
future.apply是对并行计算设置的。
eBayes的结果中logFC有时候让人很困惑，需要注意该结果是否的正确性，因此设置datCoe判断富集方向，也可以通过内置的pl图发现富集方向。

get_DiffProtein_limma <- function(dataset=ExprSet_LOG,
                               group_name=subgrp[1:2],
                               pair=FALSE,
                               scale=FALSE,
                               fc=1,
                               Pval=0.05){
  # dataset=ExprSet_LOG
  # group_name=subgrp[1:2]
  # pair=FALSE
  # scale=FALSE
  # fc=1
  # Pval=0.05
  pheno <- pData(dataset) %>%
    filter(SubGroup%in%group_name)
  pheno$SubGroup <- factor(as.character(pheno$SubGroup), levels = group_name)
  pheno$PID <- factor(as.character(pheno$PID))
  if(pair){
    # paired test 
    design <- model.matrix(~ pheno$SubGroup + pheno$PID)
    rownames(design) <- rownames(pheno)
    colnames(design) <- c("Intercept",
                          paste(group_col, collapse = "-"), 
                          as.character(unique(pheno$PID)[-1]))    
  }else{
    design <- model.matrix( ~ 0 + pheno$SubGroup)
    rownames(design) <- rownames(pheno)
    colnames(design) <- group_name
  }
  # show distribution
  edata <- as.matrix(exprs(dataset))
  exprSet <- edata[, colnames(edata)%in%rownames(pheno)]  
  boxplot(exprSet)
  plotDensities(exprSet)
  # Normalization: TMM
  if(scale){
    require(edgeR)
    DGEList <- edgeR::DGEList(
                        counts = exprSet, 
                        group = pheno$SubGroup) 
    exprSet_norm <- edgeR::calcNormFactors(DGEList, method = "TMM")
    plotMDS(exprSet_norm, col=as.numeric(pheno$SubGroup))
  }else{
    exprSet_norm <- exprSet
  }
  # linear fitting
  #limma_voom <- voom(exprSet_norm, design, plot = TRUE)
  fit <- lmFit(exprSet_norm, design)
  if(pair){
    # eBayes
    fit2 <- eBayes(fit)
    qqt(fit2$t, df = fit2$df.prior+fit2$df.residual, pch = 16, cex = 0.2)
    abline(0,1)
    # differential features
    diff_gene <- topTable(fit2, number = Inf, adjust.method = 'BH', coef = 2) %>%
      rownames_to_column("GeneID")
    # delta
    require(future.apply)
    plan(multiprocess, workers = 10)
    delta_value <- future_apply(exprSet, 1, function(x, y){
      # x = exprSet[1, ]
      # y = pheno
      dat <- data.frame(value=x, y) %>%
        arrange(PID, SubGroup) %>%
        dplyr::select(PID, SubGroup, value) 
      dat$SubGroup <- factor(dat$SubGroup, levels = group_name)
      dat_delta <- dat %>% group_by(PID, SubGroup) %>%
        summarise(mean_value=mean(value)) %>%   # mean or median???
        mutate(delta=dplyr::first(mean_value) - dplyr::last(mean_value)) %>%
        ungroup()
      delta <- mean(dat_delta$delta)
      return(delta)
    }, pheno) %>% data.frame() %>%
      setNames("Delta") %>%
      rownames_to_column("GeneID")
    # stopCluster(cl)
    # combine DEG and delta
    diff_gene_delta <- inner_join(diff_gene, delta_value, by = "GeneID")    
  }else{
    # contrast group for unpaired test  
    group <- paste(group_name, collapse = "-")
    if(group%in%"NC-PC"){
      contrast <- makeContrasts(contrasts = "NC-PC",
                                levels    = design)
    }else if(group%in%"NC-PT"){
      contrast <- makeContrasts(contrasts = "NC-PT",
                                levels    = design)
    }else if(group%in%"PC-PT"){
      contrast <- makeContrasts(contrasts = "PC-PT",
                                levels    = design)
    }
    print(contrast)
    # eBayes
    fit2 <- contrasts.fit(fit, contrast)
    fit2 <- eBayes(fit2)
    qqt(fit2$t, df = fit2$df.prior+fit2$df.residual, pch = 16, cex = 0.2)
    abline(0,1)
    # differential features
    diff_gene <- topTable(fit2, number = Inf, adjust.method = 'BH', coef = 1) %>%
      rownames_to_column("GeneID")  
    # delta
    require(future.apply)
    plan(multiprocess, workers = 10)
    delta_value <- future_apply(exprSet, 1, function(x, y){
      # x = exprSet[1, ]
      # y = pheno
      dat <- data.frame(value=x, y) %>%
        arrange(SubGroup) %>%
        dplyr::select(SubGroup, value) 
      dat$Type <- factor(dat$SubGroup, levels = group_name)
      dat_delta <- dat %>% group_by(SubGroup) %>%
        summarise(mean_value=mean(value)) %>% # mean or median???
        mutate(delta=dplyr::first(mean_value) - dplyr::last(mean_value)) %>%
        ungroup() 
      delta <- mean(dat_delta$delta)
      return(delta)
    }, pheno) %>% data.frame() %>%
      setNames("Delta") %>%
      rownames_to_column("GeneID")
    # stopCluster(cl)
    # combine DEG and delta
    diff_gene_delta <- inner_join(diff_gene, delta_value, by = "GeneID")     
  }
  # validate the enriched directory
  pl <- data.frame(edata)[rownames(data.frame(edata))%in%diff_gene_delta$GeneID[1], , F] %>% 
    t() %>% data.frame() %>%
    setNames("Gene") %>%
    rownames_to_column("SampleID") %>%
    inner_join(pheno%>%rownames_to_column("SampleID"), by = "SampleID") %>%
  ggplot(aes(x=SubGroup, y=Gene))+
    geom_boxplot()+
    labs(y=diff_gene$GeneID[1], x="")+
    ggpubr::stat_compare_means(method = "wilcox.test",
                               paired = pair,
                               comparisons = list(group_name))+
    theme_bw()
  print(pl)
  # enriched directory: It is sometimes useful to check things by hand to make sure you have the right interpretation.
  for(i in 1:5){
    datCoe <- fit$coefficients[diff_gene_delta$GeneID[i], ]
    deltaMean <- as.numeric(datCoe[group_name[2]] - datCoe[group_name[1]])
    logFC <- diff_gene_delta[diff_gene_delta$GeneID%in%diff_gene_delta$GeneID[i], "logFC"]
    cat(paste0(diff_gene_delta$GeneID[i], ": ", paste(rev(group_name), collapse = "-"), " = ", signif(deltaMean, 3)))
    cat("\n")
    cat(paste0(diff_gene_delta$GeneID[i], ": ", "logFC = ", signif(logFC, 3))) 
    cat("\n")
  }
  if((deltaMean > 0 & logFC > 0) | (deltaMean < 0 & logFC < 0)){
    diff_gene_delta[which(diff_gene_delta$logFC >= fc & diff_gene_delta$adj.P.Val < Pval), "Enrichment"] <- group_name[2]
    diff_gene_delta[which(diff_gene_delta$logFC <= -fc & diff_gene_delta$adj.P.Val < Pval), "Enrichment"] <- group_name[1]
    diff_gene_delta[which(abs(diff_gene_delta$logFC) < fc | diff_gene_delta$adj.P.Val >= Pval), "Enrichment"] <- "Nonsignif"     
  }else if((deltaMean > 0 & logFC < 0) | (deltaMean < 0 & logFC > 0)){
    diff_gene_delta[which(diff_gene_delta$logFC >= fc & diff_gene_delta$adj.P.Val < Pval), "Enrichment"] <- group_name[1]
    diff_gene_delta[which(diff_gene_delta$logFC <= -fc & diff_gene_delta$adj.P.Val < Pval), "Enrichment"] <- group_name[2]
    diff_gene_delta[which(abs(diff_gene_delta$logFC) < fc | diff_gene_delta$adj.P.Val >= Pval), "Enrichment"] <- "Nonsignif"     
  }
  # Number & Block
  dat_status <- table(pheno$SubGroup)
  dat_status_number <- as.numeric(dat_status)
  dat_status_name <- names(dat_status)
  diff_gene_delta$Block <- paste(paste(dat_status_number[1], dat_status_name[1], sep = "_"),
                         "vs",
                         paste(dat_status_number[2], dat_status_name[2], sep = "_"))
  res <- diff_gene_delta %>% dplyr::select(GeneID, Block, logFC, adj.P.Val, Enrichment, everything()) %>%
    arrange(adj.P.Val, logFC) 
  print(dim(res %>% filter(Enrichment != "Nonsignif")))
  return(res)
}
NC_PC_LOG2Impute <- get_DiffProtein_limma(
                    dataset    = ExprSet_LOG2Impute,
                    group_name = subgrp[1:2],
                    pair      = FALSE,
                    scale     = FALSE,
                    fc        = 1,
                    Pval      = 0.05)
write.csv(NC_PC_LOG2Impute, "NC_PC_limma_Mass_LOG2Impute.csv", row.names = F)
NC_PT_LOG2Impute <- get_DiffProtein_limma(
                    dataset    = ExprSet_LOG2Impute,
                    group_name = subgrp[c(1,3)],
                    pair      = FALSE,
                    scale     = FALSE,                    
                    fc        = 1,
                    Pval      = 0.05)
write.csv(NC_PT_LOG2Impute, "NC_PT_limma_Mass_LOG2Impute.csv", row.names = F)
PC_PT_LOG2Impute <- get_DiffProtein_limma(
                    dataset    = ExprSet_LOG2Impute,
                    group_name = subgrp[c(2, 3)],
                    pair      = FALSE,
                    scale     = FALSE,                    
                    fc        = 1,
                    Pval      = 0.05)
write.csv(PC_PT_LOG2Impute, "PC_PT_limma_Mass_LOG2Impute.csv", row.names = F)

Welch’s t-test

The independent samples t-test comes in two different forms:
the standard Student’s t-test, which assumes that the variance of the two groups are equal. the Welch’s t-test, which is less restrictive compared to the original Student’s test. This is the test where you do not assume that the variance is the same in the two groups, which results in the fractional degrees of freedom.

Welch_ttest <- function(dataset=ExprSet_LOG2Impute,
                        group_name=subgrp[1:2],
                        Pval=0.05,
                        fc=1){

  # dataset=ExprSet_LOG2Impute
  # group_name=subgrp[1:2]
  # Pval=0.05
  # fc=1

  pheno <- pData(dataset) %>%
    filter(SubGroup%in%group_name) %>%
    mutate(SubGroup=factor(SubGroup, levels = group_name))
  edata <- exprs(dataset)[, rownames(pheno)]

  require(rstatix)  
  Welch_res <- apply(edata, 1, function(x, y){
    # x <- edata[1, ]
    # y <- pheno$Group
    dat <- data.frame(value=x, group=y)
    mdn <- tapply(dat$value, dat$group, median) %>% 
      data.frame() %>% setNames("value") %>%
      rownames_to_column("Group")
    mdn1 <- with(mdn, mdn[Group%in%group_name[1], "value"])
    mdn2 <- with(mdn, mdn[Group%in%group_name[2], "value"])
    Log2FC <- log2(mdn2/mdn1)
    rest <- t_test(data = dat, value ~ group)
    return(c(Log2FC, rest$statistic, rest$p))
  }, pheno$SubGroup) %>% 
    t() %>% data.frame() %>%
    setNames(c("logFC", "rho", "P.value"))

  res <- Welch_res[!is.nan(Welch_res$logFC), ] %>%
    rownames_to_column("GeneID")
  res$adj.P.Val <- p.adjust(as.numeric(res$P.value), method = "BH")
  # Number & Block
  dat_status <- table(pheno$SubGroup)
  dat_status_number <- as.numeric(dat_status)
  dat_status_name <- names(dat_status)
  res$Block <- paste(paste(dat_status_number[1], dat_status_name[1], sep = "_"),
                         "vs",
                         paste(dat_status_number[2], dat_status_name[2], sep = "_")) 
  # Enrichment
  res[which(res$logFC >= fc & res$adj.P.Val < Pval), "Enrichment"] <- group_name[2]
  res[which(res$logFC <= -fc & res$adj.P.Val < Pval), "Enrichment"] <- group_name[1]
  res[which(abs(res$logFC) < fc | res$adj.P.Val >= Pval), "Enrichment"] <- "Nonsignif"

  res <- res %>% dplyr::select(GeneID, Block, logFC, adj.P.Val, Enrichment, everything()) %>%
    arrange(adj.P.Val, logFC) 

  return(res)
}

NC_PC_LOG2Impute_WelchT <- Welch_ttest(
                        dataset    = ExprSet_LOG2Impute,
                        group_name = subgrp[1:2],
                        fc         = 1,
                        Pval       = 0.05)
write.csv(NC_PC_LOG2Impute_WelchT, "NC_PC_WelchT_Mass_LOG2Impute.csv", row.names = F)

NC_PT_LOG2Impute_WelchT <- Welch_ttest(
                    dataset    = ExprSet_LOG2Impute,
                    group_name = subgrp[c(1,3)],
                    fc         = 1,
                    Pval       = 0.05)
write.csv(NC_PT_LOG2Impute_WelchT, "NC_PT_WelchT_Mass_LOG2Impute.csv", row.names = F)

PC_PT_LOG2Impute_WelchT <- Welch_ttest(
                    dataset    = ExprSet_LOG2Impute,
                    group_name = subgrp[c(2,3)],
                    fc         = 1,
                    Pval       = 0.05)
write.csv(PC_PT_LOG2Impute_WelchT, "PC_PT_WelchT_Mass_LOG2Impute.csv", row.names = F)

systemic information

sessionInfo()

R version 4.0.2 (2020-06-22)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: CentOS Linux 8 (Core)

Matrix products: default
BLAS/LAPACK: /disk/share/anaconda3/lib/libopenblasp-r0.3.10.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] rstatix_0.7.0       convert_1.64.0      marray_1.68.0       limma_3.46.0        Biobase_2.50.0      BiocGenerics_0.36.0
[7] ggplot2_3.3.3       tibble_3.1.0        dplyr_1.0.5        

loaded via a namespace (and not attached):
 [1] sass_0.3.1        edgeR_3.32.1      tidyr_1.1.3       jsonlite_1.7.2    carData_3.0-4     bslib_0.2.4      
 [7] assertthat_0.2.1  askpass_1.1       cellranger_1.1.0  yaml_2.2.1        pillar_1.6.0      backports_1.2.1  
[13] lattice_0.20-41   glue_1.4.2        reticulate_1.18   digest_0.6.27     ggsignif_0.6.0    colorspace_2.0-0 
[19] cowplot_1.1.0     htmltools_0.5.1.1 Matrix_1.3-2      plyr_1.8.6        pkgconfig_2.0.3   broom_0.7.6      
[25] haven_2.3.1       purrr_0.3.4       scales_1.1.1      RSpectra_0.16-0   openxlsx_4.2.3    rio_0.5.16       
[31] openssl_1.4.3     generics_0.1.0    farver_2.1.0      car_3.0-10        ellipsis_0.3.1    ggpubr_0.4.0     
[37] withr_2.4.1       umap_0.2.7.0      magrittr_2.0.1    crayon_1.4.1      readxl_1.3.1      evaluate_0.14    
[43] fansi_0.4.2       forcats_0.5.0     foreign_0.8-81    tools_4.0.2       data.table_1.14.0 hms_1.0.0        
[49] lifecycle_1.0.0   stringr_1.4.0     munsell_0.5.0     locfit_1.5-9.4    zip_2.1.1         jquerylib_0.1.3  
[55] compiler_4.0.2    tinytex_0.31      rlang_0.4.10      grid_4.0.2        labeling_0.4.2    rmarkdown_2.7    
[61] gtable_0.3.0      abind_1.4-5       DBI_1.1.1         curl_4.3          reshape2_1.4.4    R6_2.5.0         
[67] knitr_1.31        utf8_1.2.1        stringi_1.4.6     Rcpp_1.0.6        vctrs_0.3.7       tidyselect_1.1.0 
[73] xfun_0.20

前言

导入数据

Differential Expression Analysis

Welch’s t-test

systemic information

Reference