数据标准化—异常样本于重复性检测—差异表达分析

一、数据标准化

  1. rm(list = ls())
  2. options(stringsAsFactors = F)
  3. library(stringr)
  4. ## ====================1.读取数据
  5. # 读取raw count表达矩阵
  6. rawcount <- read.table("data/raw_counts.txt",row.names = 1,
  7. sep = "\t", header = T)
  8. colnames(rawcount)
  9. # 查看表达谱
  10. rawcount[1:4,1:4]
  11. # 去除前的基因表达矩阵情况
  12. dim(rawcount)
  13. # 获取分组信息
  14. group <- read.table("data/filereport_read_run_PRJNA229998_tsv.txt",
  15. header = T,sep = "\t", quote = "\"")
  16. colnames(group)
  17. # 提取表达矩阵对应的样本表型信息
  18. group <- group[match(colnames(rawcount), group$run_accession),
  19. c("run_accession","sample_title")]
  20. group
  21. # 差异分析方案为:Dex vs untreated
  22. group$sample_title <- str_split_fixed(group$sample_title,pattern = "_", n=2)[,2]
  23. group
  24. write.table(group,file = "data/group.txt",row.names = F,sep = "\t",quote = F)
  25. ## =================== 2.表达矩阵预处理
  26. # 过滤低表达基因
  27. keep <- rowSums(rawcount>0) >= floor(0.75*ncol(rawcount))
  28. table(keep)
  29. filter_count <- rawcount[keep,]
  30. filter_count[1:4,1:4]
  31. dim(filter_count)
  32. # 加载edgeR包计算counts per millio(cpm) 表达矩阵
  33. library(edgeR)
  34. express_cpm <- cpm(filter_count)
  35. express_cpm[1:6,1:6]
  36. # 保存表达矩阵和分组结果
  37. save(filter_count, express_cpm, group, file = "data/Step01-airwayData.Rdata")

二、异常样本与重复性检测

  1. rm(list = ls())
  2. options(stringsAsFactors = F)
  3. # 加载包,设置绘图参数
  4. library(ggplot2)
  5. library(ggsci)
  6. mythe <- theme_bw() + theme(panel.grid.major=element_blank(),
  7. panel.grid.minor=element_blank())
  8. # 加载原始表达的数据
  9. lname <- load(file = "data/Step01-airwayData.Rdata")
  10. lname
  11. exprSet <- log10(as.matrix(express_cpm)+1)
  12. exprSet[1:6,1:6]
  13. ## 1.样本表达总体分布-箱式图
  14. # 构造绘图数据
  15. data <- data.frame(expression=c(exprSet),
  16. sample=rep(colnames(exprSet),each=nrow(exprSet)))
  17. head(data)
  18. p <- ggplot(data = data, aes(x=sample,y=expression,fill=sample))
  19. p1 <- p + geom_boxplot() +
  20. mythe+ theme(axis.text.x = element_text(angle = 90)) +
  21. xlab(NULL) + ylab("log10(CPM+1)") + scale_fill_lancet()
  22. p1
  23. # 保存图片
  24. png(file = "result/1.sample_boxplot.png",width = 800, height = 900,res=150)
  25. print(p1)
  26. dev.off()
  27. ## 2.样本表达总体分布-小提琴图
  28. p2 <- p + geom_violin() + mythe +
  29. theme(axis.text = element_text(size = 12),
  30. axis.text.x = element_text(angle = 90)) +
  31. xlab(NULL) + ylab("log10(CPM+1)")+scale_fill_lancet()
  32. p2
  33. # 保存图片
  34. png(file = "result/1.sample_violin.png",width = 800, height = 900,res=150)
  35. print(p2)
  36. dev.off()
  37. ## 3.样本表达总体分布-概率密度分布图
  38. m <- ggplot(data=data, aes(x=expression))
  39. p3 <- m + geom_density(aes(fill=sample, colour=sample),alpha = 0.1) +
  40. xlab("log10(CPM+1)") + mythe +scale_fill_lancet()
  41. p3
  42. # 保存图片
  43. png(file = "result/1.sample_density.png",width = 800, height = 700, res=150)
  44. print(p3)
  45. dev.off()
  46. # 魔幻操作,一键清空
  47. rm(list = ls())
  48. options(stringsAsFactors = F)
  49. library(FactoMineR)
  50. library(factoextra)
  51. library(corrplot)
  52. library(pheatmap)
  53. library(tidyverse)
  54. # 加载数据并检查
  55. lname <- load(file = 'data/Step01-airwayData.Rdata')
  56. lname
  57. ## 1.样本之间的相关性-层次聚类树----
  58. dat <- log10(express_cpm+1)
  59. dat[1:4,1:4]
  60. dim(dat)
  61. sampleTree <- hclust(dist(t(dat)), method = "average")
  62. plot(sampleTree)
  63. # 提取样本聚类信息
  64. temp <- as.data.frame(cutree(sampleTree,k = 2)) %>%
  65. rownames_to_column(var="sample")
  66. temp1 <- merge(temp,group,by.x = "sample",by.y="run_accession")
  67. table(temp1$`cutree(sampleTree, k = 2)`,temp1$sample_title)
  68. # 保存结果
  69. pdf(file = "result/2.sample_Treeplot.pdf",width = 7,height = 6)
  70. plot(sampleTree)
  71. dev.off()
  72. ## 2.样本之间的相关性-PCA----
  73. # 第一步,数据预处理
  74. dat <- log10(express_cpm+1)
  75. dat[1:4,1:4]
  76. dat <- as.data.frame(t(dat))
  77. dat_pca <- PCA(dat, graph = FALSE)
  78. group_list <- group[match(group$run_accession,rownames(dat)), 2]
  79. group_list
  80. # geom.ind: point显示点,text显示文字
  81. # palette: 用不同颜色表示分组
  82. # addEllipses: 是否圈起来
  83. mythe <- theme_bw() +
  84. theme(panel.grid.major=element_blank(),panel.grid.minor=element_blank()) +
  85. theme(plot.title = element_text(hjust = 0.5))
  86. p <- fviz_pca_ind(dat_pca,
  87. geom.ind = "text", #point
  88. col.ind = group_list,
  89. palette = c("#00AFBB", "#E7B800"),
  90. addEllipses = T,
  91. legend.title = "Groups") + mythe
  92. p
  93. # 保存结果
  94. pdf(file = "result/2.sample_PCA.pdf",width = 6.5,height = 6)
  95. plot(p)
  96. dev.off()
  97. ## 3.样本之间的相关性-cor----
  98. # 选择差异变化大的基因算样本相关性
  99. exprSet <- express_cpm
  100. exprSet = exprSet[names(sort(apply(exprSet, 1, mad),decreasing = T)[1:800]),]
  101. dim(exprSet)
  102. # 计算相关性
  103. M <- cor(exprSet,method = "spearman")
  104. M
  105. # 构造注释条
  106. anno <- data.frame(group=group$sample_title,row.names = group$run_accession )
  107. # 保存结果
  108. pheatmap(M,display_numbers = T,
  109. annotation_col = anno,
  110. fontsize = 10,cellheight = 30,
  111. cellwidth = 30,cluster_rows = T,
  112. cluster_cols = T,
  113. filename = "result/2.sample_Cor.pdf",width = 7.5,height = 7)

三、差异表达分析

  1. rm(list = ls())
  2. options(stringsAsFactors = F)
  3. # 加载包
  4. library(edgeR)
  5. library(ggplot2)
  6. # 读取基因表达矩阵信息并查看分组信息和表达矩阵数据
  7. lname <- load(file = "data/Step01-airwayData.Rdata")
  8. lname
  9. # 表达谱
  10. filter_count[1:4,1:4]
  11. # 分组信息
  12. group_list <- group[match(colnames(filter_count),group$run_accession),2]
  13. group_list
  14. # treat vs control
  15. comp <- unlist(strsplit("Dex_vs_untreated",split = "_vs_"))
  16. group_list <- factor(group_list,levels = comp)
  17. group_list
  18. table(group_list)
  19. # 构建线性模型。0代表x线性模型的截距为0
  20. design <- model.matrix(~0+group_list)
  21. rownames(design) <- colnames(filter_count)
  22. colnames(design) <- levels(factor(group_list))
  23. design
  24. # 构建edgeR的DGEList对象
  25. DEG <- DGEList(counts=filter_count,
  26. group=factor(group_list))
  27. # 归一化基因表达分布
  28. DEG <- calcNormFactors(DEG)
  29. # 计算线性模型的参数
  30. DEG <- estimateGLMCommonDisp(DEG,design)
  31. DEG <- estimateGLMTrendedDisp(DEG, design)
  32. DEG <- estimateGLMTagwiseDisp(DEG, design)
  33. # 拟合线性模型
  34. fit <- glmFit(DEG, design)
  35. # 进行差异分析
  36. lrt <- glmLRT(fit, contrast=c(1,-1))
  37. # 提取过滤差异分析结果
  38. DEG_edgeR <- as.data.frame(topTags(lrt, n=nrow(DEG)))
  39. head(DEG_edgeR)
  40. # 筛选上下调,设定阈值
  41. fc_cutoff <- 1.5
  42. pvalue <- 0.05
  43. DEG_edgeR$regulated <- "normal"
  44. loc_up <- intersect(which( DEG_edgeR$logFC > log2(fc_cutoff) ),
  45. which( DEG_edgeR$PValue < pvalue) )
  46. loc_down <- intersect(which(DEG_edgeR$logFC < (-log2(fc_cutoff))),
  47. which(DEG_edgeR$PValue<pvalue))
  48. DEG_edgeR$regulated[loc_up] <- "up"
  49. DEG_edgeR$regulated[loc_down] <- "down"
  50. table(DEG_edgeR$regulated)
  51. ## 添加一列gene symbol
  52. # 方法1:使用包
  53. library(org.Hs.eg.db)
  54. keytypes(org.Hs.eg.db)
  55. library(clusterProfiler)
  56. id2symbol <- bitr(rownames(DEG_edgeR),
  57. fromType = "ENSEMBL",
  58. toType = "SYMBOL",
  59. OrgDb = org.Hs.eg.db)
  60. head(id2symbol)
  61. DEG_edgeR <- cbind(GeneID=rownames(DEG_edgeR),DEG_edgeR)
  62. DEG_edgeR_symbol <- merge(id2symbol,DEG_edgeR,
  63. by.x="ENSEMBL",by.y="GeneID",all.y=T)
  64. head(DEG_edgeR_symbol)
  65. # 方法2:gtf文件中得到的id与name关系
  66. # Assembly: GRCh37(hg19) Release: ?
  67. # 使用上课测试得到的count做
  68. # 选择显著差异表达的结果
  69. library(tidyverse)
  70. DEG_edgeR_symbol_Sig <- filter(DEG_edgeR_symbol,regulated!="normal")
  71. # 保存
  72. write.csv(DEG_edgeR_symbol,"result/4.DEG_edgeR_all.csv", row.names = F)
  73. write.csv(DEG_edgeR_symbol_Sig,"result/4.DEG_edgeR_Sig.csv", row.names = F)
  74. save(DEG_edgeR_symbol,file = "data/Step03-edgeR_nrDEG.Rdata")
  75. ##====== 检查是否上下调设置错了
  76. # 挑选一个差异表达基因
  77. head(DEG_edgeR_symbol_Sig)
  78. exp <- c(t(express_cpm[match("ENSG00000001626",rownames(express_cpm)),]))
  79. test <- data.frame(value=exp, group=group_list)
  80. ggplot(data=test,aes(x=group,y=value,fill=group)) + geom_boxplot()
  81. ##可视化
  82. rm(list = ls())
  83. options(stringsAsFactors = F)
  84. library(ggplot2)
  85. library(tidyverse)
  86. # 读差异分析结果
  87. lname <- load(file = "data//Step03-edgeR_nrDEG.Rdata")
  88. # 根据需要修改DEG的值
  89. data <- DEG_edgeR_symbol
  90. colnames(data)
  91. # 绘制火山图
  92. colnames(data)
  93. p <- ggplot(data=data, aes(x=logFC, y=-log10(PValue),color=regulated)) +
  94. geom_point(alpha=0.5, size=1.2) +
  95. theme_set(theme_set(theme_bw(base_size=20))) + theme_bw() +
  96. theme(panel.grid.major=element_blank(),panel.grid.minor=element_blank()) +
  97. xlab("log2FC") + ylab("-log10(Pvalue)") +
  98. scale_colour_manual(values = c(down='blue',normal='grey',up='red')) +
  99. geom_vline(xintercept=c(-(log2(1.5)),log2(1.5)),lty=2,col="black",lwd=0.6) +
  100. geom_hline(yintercept = -log10(0.05),lty=2,col="black",lwd=0.6)
  101. p
  102. # 添加top基因
  103. # 通过FC选取TOP10
  104. label <- data[order(abs(data$logFC),decreasing = T)[1:10],]
  105. # 通过pvalue选取TOP10
  106. #label <- data[order(abs(data$PValue),decreasing = F)[1:10],]
  107. label <- na.omit(label)
  108. p1 <- p + geom_point(size = 3, shape = 1, data = label) +
  109. ggrepel::geom_text_repel( aes(label = SYMBOL), data = label, color="black" )
  110. p1
  111. # 保存结果
  112. png(file = "result/5.Volcano_Plot.png",width = 900, height = 800, res=150)
  113. plot(p1)
  114. dev.off()
  115. rm(list = ls())
  116. options(stringsAsFactors = F)
  117. # 加载包
  118. library(pheatmap)
  119. library(tidyverse)
  120. # 加载原始表达矩阵
  121. lname <- load(file = "data/Step01-airwayData.Rdata")
  122. lname
  123. express_cpm1 <- rownames_to_column(as.data.frame(express_cpm) ,var = "ID")
  124. # 读取差异分析结果
  125. lname <- load(file = "data/Step03-edgeR_nrDEG.Rdata")
  126. lname
  127. # 提取所有差异表达的基因名
  128. edgeR_sigGene <- DEG_edgeR_symbol[DEG_edgeR_symbol$regulated!="normal",]
  129. head(edgeR_sigGene)
  130. data <- merge(edgeR_sigGene,express_cpm1,by.x = "ENSEMBL",by.y = "ID")
  131. data <- na.omit(data)
  132. data <- data[!duplicated(data$SYMBOL),]
  133. # 绘制热图
  134. dat <- select(data,starts_with("SRR"))
  135. rownames(dat) <- data$SYMBOL
  136. dat[1:4,1:4]
  137. anno <- data.frame(group=group$sample_title,row.names = group$run_accession)
  138. pheatmap(dat,scale = "row",show_colnames =T,
  139. show_rownames = F, cluster_cols = T,
  140. annotation_col=anno,
  141. main = "edgeR's DEG")
  142. # 显示指定symbol,这里随便展示10个基因symbol
  143. labels <- rep(x = "",times=nrow(dat))
  144. labels[1:10] <- rownames(dat)[1:10]
  145. pheatmap(dat,scale = "row",show_colnames =T,
  146. show_rownames = T,
  147. cluster_cols = T,
  148. annotation_col=anno,
  149. labels_row = labels,
  150. fontsize_row = 8,
  151. main = "edgeR's DEG")
  152. # 按照指定顺序绘制热图
  153. dex_exp <- express_cpm[,match(rownames(anno)[which(anno$group=="Dex")],
  154. colnames(express_cpm))]
  155. untreated_exp <- express_cpm[,match(rownames(anno)[which(anno$group=="untreated")],
  156. colnames(express_cpm))]
  157. data_new <- cbind(dex_exp, untreated_exp)
  158. dat1 <- data_new[match(edgeR_sigGene$ENSEMBL,rownames(data_new)),]
  159. pheatmap(dat1, scale = "row",show_colnames =T,show_rownames = F,
  160. cluster_cols = F,
  161. annotation_col=anno,
  162. main = "edgeR's DEG")

四、功能注释

学习KEGG

五、功能富集

1、GO_KEGG

rm(list = ls())
options(stringsAsFactors = F)

library(clusterProfiler)
library(org.Hs.eg.db)
library(GSEABase)
library(ggplot2)
library(tidyverse)


## Error in download.KEGG.Path(species)
# https://github.com/YuLab-SMU/clusterProfiler/pull/471
getOption("clusterProfiler.download.method")
#R.utils::setOption("clusterProfiler.download.method",'auto')
options(clusterProfiler.download.method = "wininet")
#options(clusterProfiler.download.method = "wget")
getOption("clusterProfiler.download.method")



# 读取差异分析结果
load(file = "data/Step03-edgeR_nrDEG.Rdata")
ls()

# 提取所有差异表达的基因名
DEG <- DEG_edgeR_symbol[DEG_edgeR_symbol$regulated!="normal",2]
head(DEG)

## ===GO数据库, 输出所有结果,后续可根据pvalue挑选结果
ego_CC <- enrichGO(gene=DEG, OrgDb= 'org.Hs.eg.db', keyType='SYMBOL', ont="CC", pvalueCutoff= 1,qvalueCutoff= 1)
ego_MF <- enrichGO(gene=DEG, OrgDb= 'org.Hs.eg.db', keyType='SYMBOL', ont="MF", pvalueCutoff= 1,qvalueCutoff= 1)
ego_BP <- enrichGO(gene=DEG, OrgDb= 'org.Hs.eg.db', keyType='SYMBOL', ont="BP", pvalueCutoff= 1,qvalueCutoff= 1)

p_BP <- barplot(ego_BP,showCategory = 10) + ggtitle("Biological process")
p_CC <- barplot(ego_CC,showCategory = 10) + ggtitle("Cellular component")
p_MF <- barplot(ego_MF,showCategory = 10) + ggtitle("Molecular function")
plotc <- p_BP/p_CC/p_MF
plotc
ggsave('result/6.enrichGO.png', plotc, width = 10,height = 16)

ego_BP <- data.frame(ego_BP)
ego_CC <- data.frame(ego_CC)
ego_MF <- data.frame(ego_MF)
write.csv(ego_BP,'result/6.enrichGO_BP.csv')
write.csv(ego_CC,'result/6.enrichGO_CC.csv')
write.csv(ego_MF,'result/6.enrichGO_MF.csv')



## === KEGG
genelist <- bitr(gene=DEG, fromType="SYMBOL", toType="ENTREZID", OrgDb='org.Hs.eg.db')
genelist <- pull(genelist,ENTREZID)               
ekegg <- enrichKEGG(gene = genelist, organism = 'hsa', pvalueCutoff = 1, qvalueCutoff = 1)
p1 <- barplot(ekegg, showCategory=10)
p2 <- dotplot(ekegg, showCategory=10)
plotc = p1/p2
plotc
ggsave('result/6.enrichKEGG.png', plot = plotc, width = 8, height = 10)

ekegg <- data.frame(ekegg)
write.csv(ekegg,'result/6.enrichKEGG.csv')



## === 其他数据库通路
geneset <- read.gmt("data/MsigDB/v7.4/h.all.v7.4.symbols.gmt")
table(geneset$term)
geneset$term <- gsub(pattern = "HALLMARK_","", geneset$term)
geneset$term <- str_to_title(geneset$term)

my_path <- enricher(gene=DEG, pvalueCutoff = 1, qvalueCutoff = 1, TERM2GENE=geneset)
p1 <- barplot(my_path, showCategory=15,color = "pvalue")
p1
ggsave("result/6.enrich_HALLMARK.png", plot = p1, width = 8, height = 7)

my_path <- data.frame(my_path)
write.csv(my_path,"result/6.enrich_HALLMARK.csv")

2、GSEA

# 清空当前环境变量
rm(list = ls())
options(stringsAsFactors = F)

# 加载包
library(GSEABase)
library(clusterProfiler)
library(enrichplot)
library(ggplot2)
library(stats)

# 加载数据
load("data/Step03-edgeR_nrDEG.Rdata")
DEG <- DEG_edgeR_symbol

## 构造GSEA分析数据
# 去掉没有配对上symbol的行
DEG <- DEG[!is.na(DEG$SYMBOL),]

# 去掉重复行
DEG <- DEG[!duplicated(DEG$SYMBOL),]

geneList <- DEG$logFC
names(geneList) <- DEG$SYMBOL
head(geneList)
geneList <- sort(geneList,decreasing = T)
head(geneList)
tail(geneList)


# 选择gmt文件
geneset <- read.gmt("data/MsigDB/v7.4/c5.go.bp.v7.4.symbols.gmt")

# 运行,输出全部结果
egmt <- GSEA(geneList, TERM2GENE=geneset, pvalueCutoff = 1)

#出点图
dotplot(egmt)

#按p值出点图
dotplot(egmt,color="pvalue")   

# 单个通路图
# 按照通路名
gseaplot2(egmt, "GOBP_MITOCHONDRIAL_GENOME_MAINTENANCE",  
          title = "GOBP_MITOCHONDRIAL_GENOME_MAINTENANCE")
# 按照行数
gseaplot2(egmt, 10, color="red", pvalue_table = T)

#按第一到第十个出图,不显示p值
gseaplot2(egmt, 1:10, color="red") 


# 保存结果
go_gsea <- as.data.frame(egmt@result)
write.csv(go_gsea,"result/6.gsea_go_fc.csv",row.names = F)

代码均来自于生信技能树张娟老师