前言

上一节,我们通过对组间整体质谱数据分析后,发现NC和PC、PT的组间差异显著,那么具体到单个蛋白质水平的结果又是如何呢。更多知识分享请到 https://zouhua.top/

在单个蛋白质水平上做比较分析,我们可以使用limma包的函数进行分析。除了组间比较外,它还可以对某些可能潜在影响比较结果的因素如性别、年龄等进行校正处理(可以先期对组间临床表型进行组间差异比较,查看哪些指标是组间显著差异的,这些指标可能影响蛋白质的组间差异比较。它们应该作为协变量被校正掉).

除了使用基于线性模型的limma包外,还可以使用常用的t-test等,但默认情况下,我们会认为case和control组的数据都服从同一正态分布,具有相同的方差,这个时候我们会使用student-t test。可实际上,case和control组的分布可能不是同一个方差,我们这个时候应该选择Welch’s t-test。

导入数据

  1. library(dplyr)
  2. library(tibble)
  3. library(ggplot2)
  4. library(convert)
  5. library(limma)
  6. # rm(list = ls())
  7. options(stringsAsFactors = F)
  8. options(future.globals.maxSize = 1000 * 1024^2)
  9. subgrp <- c("NC", "PC", "PT")
  10. grp.col <- c("#568875", "#73FAFC", "#EE853D")
  11. ExprSet_LOG2Impute <- readRDS("Mass_Proteins_filtered_Normal_LOG2Impute.RDS")

Differential Expression Analysis

  1. 如果数据存在配对情况,则可设置pair=T。
  2. scale参数是针对counts数据设置的。
  3. future.apply是对并行计算设置的。
  4. eBayes的结果中logFC有时候让人很困惑,需要注意该结果是否的正确性,因此设置datCoe判断富集方向,也可以通过内置的pl图发现富集方向。
  1. get_DiffProtein_limma <- function(dataset=ExprSet_LOG,
  2. group_name=subgrp[1:2],
  3. pair=FALSE,
  4. scale=FALSE,
  5. fc=1,
  6. Pval=0.05){
  7. # dataset=ExprSet_LOG
  8. # group_name=subgrp[1:2]
  9. # pair=FALSE
  10. # scale=FALSE
  11. # fc=1
  12. # Pval=0.05
  13. pheno <- pData(dataset) %>%
  14. filter(SubGroup%in%group_name)
  15. pheno$SubGroup <- factor(as.character(pheno$SubGroup), levels = group_name)
  16. pheno$PID <- factor(as.character(pheno$PID))
  17. if(pair){
  18. # paired test
  19. design <- model.matrix(~ pheno$SubGroup + pheno$PID)
  20. rownames(design) <- rownames(pheno)
  21. colnames(design) <- c("Intercept",
  22. paste(group_col, collapse = "-"),
  23. as.character(unique(pheno$PID)[-1]))
  24. }else{
  25. design <- model.matrix( ~ 0 + pheno$SubGroup)
  26. rownames(design) <- rownames(pheno)
  27. colnames(design) <- group_name
  28. }
  29. # show distribution
  30. edata <- as.matrix(exprs(dataset))
  31. exprSet <- edata[, colnames(edata)%in%rownames(pheno)]
  32. boxplot(exprSet)
  33. plotDensities(exprSet)
  34. # Normalization: TMM
  35. if(scale){
  36. require(edgeR)
  37. DGEList <- edgeR::DGEList(
  38. counts = exprSet,
  39. group = pheno$SubGroup)
  40. exprSet_norm <- edgeR::calcNormFactors(DGEList, method = "TMM")
  41. plotMDS(exprSet_norm, col=as.numeric(pheno$SubGroup))
  42. }else{
  43. exprSet_norm <- exprSet
  44. }
  45. # linear fitting
  46. #limma_voom <- voom(exprSet_norm, design, plot = TRUE)
  47. fit <- lmFit(exprSet_norm, design)
  48. if(pair){
  49. # eBayes
  50. fit2 <- eBayes(fit)
  51. qqt(fit2$t, df = fit2$df.prior+fit2$df.residual, pch = 16, cex = 0.2)
  52. abline(0,1)
  53. # differential features
  54. diff_gene <- topTable(fit2, number = Inf, adjust.method = 'BH', coef = 2) %>%
  55. rownames_to_column("GeneID")
  56. # delta
  57. require(future.apply)
  58. plan(multiprocess, workers = 10)
  59. delta_value <- future_apply(exprSet, 1, function(x, y){
  60. # x = exprSet[1, ]
  61. # y = pheno
  62. dat <- data.frame(value=x, y) %>%
  63. arrange(PID, SubGroup) %>%
  64. dplyr::select(PID, SubGroup, value)
  65. dat$SubGroup <- factor(dat$SubGroup, levels = group_name)
  66. dat_delta <- dat %>% group_by(PID, SubGroup) %>%
  67. summarise(mean_value=mean(value)) %>% # mean or median???
  68. mutate(delta=dplyr::first(mean_value) - dplyr::last(mean_value)) %>%
  69. ungroup()
  70. delta <- mean(dat_delta$delta)
  71. return(delta)
  72. }, pheno) %>% data.frame() %>%
  73. setNames("Delta") %>%
  74. rownames_to_column("GeneID")
  75. # stopCluster(cl)
  76. # combine DEG and delta
  77. diff_gene_delta <- inner_join(diff_gene, delta_value, by = "GeneID")
  78. }else{
  79. # contrast group for unpaired test
  80. group <- paste(group_name, collapse = "-")
  81. if(group%in%"NC-PC"){
  82. contrast <- makeContrasts(contrasts = "NC-PC",
  83. levels = design)
  84. }else if(group%in%"NC-PT"){
  85. contrast <- makeContrasts(contrasts = "NC-PT",
  86. levels = design)
  87. }else if(group%in%"PC-PT"){
  88. contrast <- makeContrasts(contrasts = "PC-PT",
  89. levels = design)
  90. }
  91. print(contrast)
  92. # eBayes
  93. fit2 <- contrasts.fit(fit, contrast)
  94. fit2 <- eBayes(fit2)
  95. qqt(fit2$t, df = fit2$df.prior+fit2$df.residual, pch = 16, cex = 0.2)
  96. abline(0,1)
  97. # differential features
  98. diff_gene <- topTable(fit2, number = Inf, adjust.method = 'BH', coef = 1) %>%
  99. rownames_to_column("GeneID")
  100. # delta
  101. require(future.apply)
  102. plan(multiprocess, workers = 10)
  103. delta_value <- future_apply(exprSet, 1, function(x, y){
  104. # x = exprSet[1, ]
  105. # y = pheno
  106. dat <- data.frame(value=x, y) %>%
  107. arrange(SubGroup) %>%
  108. dplyr::select(SubGroup, value)
  109. dat$Type <- factor(dat$SubGroup, levels = group_name)
  110. dat_delta <- dat %>% group_by(SubGroup) %>%
  111. summarise(mean_value=mean(value)) %>% # mean or median???
  112. mutate(delta=dplyr::first(mean_value) - dplyr::last(mean_value)) %>%
  113. ungroup()
  114. delta <- mean(dat_delta$delta)
  115. return(delta)
  116. }, pheno) %>% data.frame() %>%
  117. setNames("Delta") %>%
  118. rownames_to_column("GeneID")
  119. # stopCluster(cl)
  120. # combine DEG and delta
  121. diff_gene_delta <- inner_join(diff_gene, delta_value, by = "GeneID")
  122. }
  123. # validate the enriched directory
  124. pl <- data.frame(edata)[rownames(data.frame(edata))%in%diff_gene_delta$GeneID[1], , F] %>%
  125. t() %>% data.frame() %>%
  126. setNames("Gene") %>%
  127. rownames_to_column("SampleID") %>%
  128. inner_join(pheno%>%rownames_to_column("SampleID"), by = "SampleID") %>%
  129. ggplot(aes(x=SubGroup, y=Gene))+
  130. geom_boxplot()+
  131. labs(y=diff_gene$GeneID[1], x="")+
  132. ggpubr::stat_compare_means(method = "wilcox.test",
  133. paired = pair,
  134. comparisons = list(group_name))+
  135. theme_bw()
  136. print(pl)
  137. # enriched directory: It is sometimes useful to check things by hand to make sure you have the right interpretation.
  138. for(i in 1:5){
  139. datCoe <- fit$coefficients[diff_gene_delta$GeneID[i], ]
  140. deltaMean <- as.numeric(datCoe[group_name[2]] - datCoe[group_name[1]])
  141. logFC <- diff_gene_delta[diff_gene_delta$GeneID%in%diff_gene_delta$GeneID[i], "logFC"]
  142. cat(paste0(diff_gene_delta$GeneID[i], ": ", paste(rev(group_name), collapse = "-"), " = ", signif(deltaMean, 3)))
  143. cat("\n")
  144. cat(paste0(diff_gene_delta$GeneID[i], ": ", "logFC = ", signif(logFC, 3)))
  145. cat("\n")
  146. }
  147. if((deltaMean > 0 & logFC > 0) | (deltaMean < 0 & logFC < 0)){
  148. diff_gene_delta[which(diff_gene_delta$logFC >= fc & diff_gene_delta$adj.P.Val < Pval), "Enrichment"] <- group_name[2]
  149. diff_gene_delta[which(diff_gene_delta$logFC <= -fc & diff_gene_delta$adj.P.Val < Pval), "Enrichment"] <- group_name[1]
  150. diff_gene_delta[which(abs(diff_gene_delta$logFC) < fc | diff_gene_delta$adj.P.Val >= Pval), "Enrichment"] <- "Nonsignif"
  151. }else if((deltaMean > 0 & logFC < 0) | (deltaMean < 0 & logFC > 0)){
  152. diff_gene_delta[which(diff_gene_delta$logFC >= fc & diff_gene_delta$adj.P.Val < Pval), "Enrichment"] <- group_name[1]
  153. diff_gene_delta[which(diff_gene_delta$logFC <= -fc & diff_gene_delta$adj.P.Val < Pval), "Enrichment"] <- group_name[2]
  154. diff_gene_delta[which(abs(diff_gene_delta$logFC) < fc | diff_gene_delta$adj.P.Val >= Pval), "Enrichment"] <- "Nonsignif"
  155. }
  156. # Number & Block
  157. dat_status <- table(pheno$SubGroup)
  158. dat_status_number <- as.numeric(dat_status)
  159. dat_status_name <- names(dat_status)
  160. diff_gene_delta$Block <- paste(paste(dat_status_number[1], dat_status_name[1], sep = "_"),
  161. "vs",
  162. paste(dat_status_number[2], dat_status_name[2], sep = "_"))
  163. res <- diff_gene_delta %>% dplyr::select(GeneID, Block, logFC, adj.P.Val, Enrichment, everything()) %>%
  164. arrange(adj.P.Val, logFC)
  165. print(dim(res %>% filter(Enrichment != "Nonsignif")))
  166. return(res)
  167. }
  168. NC_PC_LOG2Impute <- get_DiffProtein_limma(
  169. dataset = ExprSet_LOG2Impute,
  170. group_name = subgrp[1:2],
  171. pair = FALSE,
  172. scale = FALSE,
  173. fc = 1,
  174. Pval = 0.05)
  175. write.csv(NC_PC_LOG2Impute, "NC_PC_limma_Mass_LOG2Impute.csv", row.names = F)
  176. NC_PT_LOG2Impute <- get_DiffProtein_limma(
  177. dataset = ExprSet_LOG2Impute,
  178. group_name = subgrp[c(1,3)],
  179. pair = FALSE,
  180. scale = FALSE,
  181. fc = 1,
  182. Pval = 0.05)
  183. write.csv(NC_PT_LOG2Impute, "NC_PT_limma_Mass_LOG2Impute.csv", row.names = F)
  184. PC_PT_LOG2Impute <- get_DiffProtein_limma(
  185. dataset = ExprSet_LOG2Impute,
  186. group_name = subgrp[c(2, 3)],
  187. pair = FALSE,
  188. scale = FALSE,
  189. fc = 1,
  190. Pval = 0.05)
  191. write.csv(PC_PT_LOG2Impute, "PC_PT_limma_Mass_LOG2Impute.csv", row.names = F)

Welch’s t-test

The independent samples t-test comes in two different forms:
the standard Student’s t-test, which assumes that the variance of the two groups are equal. the Welch’s t-test, which is less restrictive compared to the original Student’s test. This is the test where you do not assume that the variance is the same in the two groups, which results in the fractional degrees of freedom.

Welch_ttest <- function(dataset=ExprSet_LOG2Impute,
                        group_name=subgrp[1:2],
                        Pval=0.05,
                        fc=1){

  # dataset=ExprSet_LOG2Impute
  # group_name=subgrp[1:2]
  # Pval=0.05
  # fc=1

  pheno <- pData(dataset) %>%
    filter(SubGroup%in%group_name) %>%
    mutate(SubGroup=factor(SubGroup, levels = group_name))
  edata <- exprs(dataset)[, rownames(pheno)]

  require(rstatix)  
  Welch_res <- apply(edata, 1, function(x, y){
    # x <- edata[1, ]
    # y <- pheno$Group
    dat <- data.frame(value=x, group=y)
    mdn <- tapply(dat$value, dat$group, median) %>% 
      data.frame() %>% setNames("value") %>%
      rownames_to_column("Group")
    mdn1 <- with(mdn, mdn[Group%in%group_name[1], "value"])
    mdn2 <- with(mdn, mdn[Group%in%group_name[2], "value"])
    Log2FC <- log2(mdn2/mdn1)
    rest <- t_test(data = dat, value ~ group)
    return(c(Log2FC, rest$statistic, rest$p))
  }, pheno$SubGroup) %>% 
    t() %>% data.frame() %>%
    setNames(c("logFC", "rho", "P.value"))

  res <- Welch_res[!is.nan(Welch_res$logFC), ] %>%
    rownames_to_column("GeneID")
  res$adj.P.Val <- p.adjust(as.numeric(res$P.value), method = "BH")
  # Number & Block
  dat_status <- table(pheno$SubGroup)
  dat_status_number <- as.numeric(dat_status)
  dat_status_name <- names(dat_status)
  res$Block <- paste(paste(dat_status_number[1], dat_status_name[1], sep = "_"),
                         "vs",
                         paste(dat_status_number[2], dat_status_name[2], sep = "_")) 
  # Enrichment
  res[which(res$logFC >= fc & res$adj.P.Val < Pval), "Enrichment"] <- group_name[2]
  res[which(res$logFC <= -fc & res$adj.P.Val < Pval), "Enrichment"] <- group_name[1]
  res[which(abs(res$logFC) < fc | res$adj.P.Val >= Pval), "Enrichment"] <- "Nonsignif"

  res <- res %>% dplyr::select(GeneID, Block, logFC, adj.P.Val, Enrichment, everything()) %>%
    arrange(adj.P.Val, logFC) 

  return(res)
}

NC_PC_LOG2Impute_WelchT <- Welch_ttest(
                        dataset    = ExprSet_LOG2Impute,
                        group_name = subgrp[1:2],
                        fc         = 1,
                        Pval       = 0.05)
write.csv(NC_PC_LOG2Impute_WelchT, "NC_PC_WelchT_Mass_LOG2Impute.csv", row.names = F)

NC_PT_LOG2Impute_WelchT <- Welch_ttest(
                    dataset    = ExprSet_LOG2Impute,
                    group_name = subgrp[c(1,3)],
                    fc         = 1,
                    Pval       = 0.05)
write.csv(NC_PT_LOG2Impute_WelchT, "NC_PT_WelchT_Mass_LOG2Impute.csv", row.names = F)

PC_PT_LOG2Impute_WelchT <- Welch_ttest(
                    dataset    = ExprSet_LOG2Impute,
                    group_name = subgrp[c(2,3)],
                    fc         = 1,
                    Pval       = 0.05)
write.csv(PC_PT_LOG2Impute_WelchT, "PC_PT_WelchT_Mass_LOG2Impute.csv", row.names = F)

systemic information

sessionInfo()
R version 4.0.2 (2020-06-22)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: CentOS Linux 8 (Core)

Matrix products: default
BLAS/LAPACK: /disk/share/anaconda3/lib/libopenblasp-r0.3.10.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] rstatix_0.7.0       convert_1.64.0      marray_1.68.0       limma_3.46.0        Biobase_2.50.0      BiocGenerics_0.36.0
[7] ggplot2_3.3.3       tibble_3.1.0        dplyr_1.0.5        

loaded via a namespace (and not attached):
 [1] sass_0.3.1        edgeR_3.32.1      tidyr_1.1.3       jsonlite_1.7.2    carData_3.0-4     bslib_0.2.4      
 [7] assertthat_0.2.1  askpass_1.1       cellranger_1.1.0  yaml_2.2.1        pillar_1.6.0      backports_1.2.1  
[13] lattice_0.20-41   glue_1.4.2        reticulate_1.18   digest_0.6.27     ggsignif_0.6.0    colorspace_2.0-0 
[19] cowplot_1.1.0     htmltools_0.5.1.1 Matrix_1.3-2      plyr_1.8.6        pkgconfig_2.0.3   broom_0.7.6      
[25] haven_2.3.1       purrr_0.3.4       scales_1.1.1      RSpectra_0.16-0   openxlsx_4.2.3    rio_0.5.16       
[31] openssl_1.4.3     generics_0.1.0    farver_2.1.0      car_3.0-10        ellipsis_0.3.1    ggpubr_0.4.0     
[37] withr_2.4.1       umap_0.2.7.0      magrittr_2.0.1    crayon_1.4.1      readxl_1.3.1      evaluate_0.14    
[43] fansi_0.4.2       forcats_0.5.0     foreign_0.8-81    tools_4.0.2       data.table_1.14.0 hms_1.0.0        
[49] lifecycle_1.0.0   stringr_1.4.0     munsell_0.5.0     locfit_1.5-9.4    zip_2.1.1         jquerylib_0.1.3  
[55] compiler_4.0.2    tinytex_0.31      rlang_0.4.10      grid_4.0.2        labeling_0.4.2    rmarkdown_2.7    
[61] gtable_0.3.0      abind_1.4-5       DBI_1.1.1         curl_4.3          reshape2_1.4.4    R6_2.5.0         
[67] knitr_1.31        utf8_1.2.1        stringi_1.4.6     Rcpp_1.0.6        vctrs_0.3.7       tidyselect_1.1.0 
[73] xfun_0.20

Reference

  1. Proteomics Data Analysis (2/3): Data Filtering and Missing Value Imputation
  2. Welch’s t-test: When to Use it + Examples
  3. T-TEST ESSENTIALS: DEFINITION, FORMULA AND CALCULATION