语雀:左手柳叶刀右手炭火烧
微信公众号:研平方 | 简书:研平方
关注可了解更多的科研教程及技巧。如有问题或建议,请留言。
欢迎关注我:一起学习,一起进步!

1. 套路一:观察重要表型与感兴趣基因之间的关系。

比如观察KRAS有无突变两组之间,感兴趣基因的表达是否有差异。

1.1 整理数据:提取感兴趣基因和临床表型数据

  1. expr <- exprs[exprs$symbol %in% c("Your Interesting Gene Symbol"),]
  2. rownames(expr) <- expr$symbol
  3. expr <- expr[,-1]
  4. expr <- as.data.frame(t(expr))
  5. str(expr)
  6. expr$Source.Name <- rownames(expr)
  7. comData <- merge(expr,pheno,by="Source.Name") #合并表达和表型数据

1.2 以表型为分组,观察感兴趣基因之间是否具有差异表达。

  1. ibrary(reshape2)
  2. library(gridExtra)
  3. library(ggthemes)
  4. library(ggplot2)
  5. library(ggpubr)
  6. library(ggsignif)
  7. table(comData$Characteristics.krasmut.)
  8. df <- comData[comData$Characteristics.krasmut. !="not available",]
  9. table(df$Characteristics.krasmut.)
  10. plot <- ggplot(df,aes(x=Characteristics.krasmut., y="Your Gene", fill=Characteristics.krasmut.))+
  11. geom_boxplot(aes(fill=Characteristics.krasmut.), outlier.shape = NA)+
  12. geom_jitter(size=0.1, alpha=0.2, position=position_jitter(0.3))+
  13. stat_compare_means(label = "p.signif", method = "t.test", label.x = 1.5, label.y.npc = "top", hide.ns = TRUE, col="red")+
  14. theme_pubclean() +
  15. scale_x_discrete(name="")+
  16. theme(plot.title = element_text(size = 13, face = "bold"),
  17. legend.position = "right",
  18. text = element_text(size = 13),
  19. axis.title = element_text(face="bold"),
  20. axis.text.x=element_text(size = 12, angle = 45, hjust = 1.0, vjust = 1.0))
  21. plot

2. 套路二:生存分析

  1. library(survival)
  2. library(survminer)
  3. library(ggthemes)
  4. ## remove the "not available" data
  5. df <- comData[comData$Characteristics.os.delay. !="not available",]
  6. df$os <- as.numeric(df$Characteristics.os.delay.)
  7. df$event <- as.numeric(df$Characteristics.os.event.)
  8. survi_cutoff <- surv_cutpoint(data = df, time = "os", event = "event",
  9. variables = "Your gene")
  10. summary(survi_cutoff)
  11. sur_cut <- surv_categorize(survi_cutoff)
  12. head(sur_cut)
  13. sur_cut$os <- as.numeric(sur_cut$os)
  14. sur_cut$event <- as.numeric(sur_cut$event)
  15. fit <- survfit(Surv("os", "event") ~Your gene, data = sur_cut)
  16. ggsurvplot(fit, data = sur_cat, conf.int = T, # 95%CI
  17. size = 0.6, # change line size
  18. pval = T, # p-value of log-rank test
  19. risk.table = TRUE,
  20. xlab=c("Overall survival(months)"),
  21. ggtheme = theme_gray(), # theme_minimal() theme_gray() theme_classic() theme_bw()
  22. legend.title="Your gene", legend.labs=c("High","Low") )

3. 套路三:ssgsva

3.1 make genelist and load expression data

  1. ## genelist
  2. rm(list = ls())
  3. library(GSVA)
  4. library(GSEABase)
  5. library(msigdbr)
  6. library(clusterProfiler)
  7. library(org.Hs.eg.db)
  8. library(enrichplot)
  9. library(limma)
  10. library(readxl)
  11. genelist <- read_excel("genelist.xlsx")
  12. genelist <- as.data.frame(genelist)
  13. pathway_list <- lapply(genelist, function(x) {
  14. unique(na.omit(x))
  15. })
  16. ## expression data
  17. load("expr.Rdata")
  18. row.names(expr) <- expr$symbol
  19. exprs <- dplyr::select(expr, -symbol)
  20. gsva <- gsva(as.matrix(exprs), pathway_list,method='ssgsea',
  21. kcdf='Gaussian',abs.ranking=TRUE)
  22. write.csv(gsva, file = "gsva.csv")
  23. ## setting group according your interesting gene expression
  24. df <- exprs["Your Gene",]
  25. df <- as.data.frame(t(df))
  26. df$group <- ifelse(df$Your Gene > median(df$Your Gene), "High", "Low")
  27. table(df$group)
  28. gsva_matrix <- t(gsva)
  29. ## Make sure the samples are in the correct order
  30. df <- df[rownames(gsva),]
  31. gsva_matrix <- cbind(gsva_matrix,df)

3.2 make analysis according the gsva result

  1. library(reshape2)
  2. library(gridExtra)
  3. library(ggthemes)
  4. library(ggplot2)
  5. library(ggpubr)
  6. library(ggsignif)
  7. table(gsva_matrix$group)
  8. rt <- gsva_matrix[,c(1:29,31)]
  9. df <- melt(rt, id.vars = "group", variable.name = "immuneCells",
  10. value.name = "Expression")
  11. df$group <- factor(df$group, levels=c('High','Low'))
  12. plot <- ggplot(df,aes(x=immuneCells, y=Expression, fill=group))+
  13. geom_boxplot(aes(fill=group), outlier.shape = NA)+
  14. geom_jitter(size=0.1, alpha=0.2, position=position_jitter(0.3))+
  15. #geom_signif(comparisons = compaired, map_signif_level = T)+
  16. stat_compare_means(label = "p.signif", method = "t.test", label.x = 1.5, label.y.npc = "top", hide.ns = TRUE, col="red")+
  17. theme_pubclean() +
  18. scale_x_discrete(name="")+
  19. theme(plot.title = element_text(size = 13, face = "bold"),
  20. legend.position = "right",
  21. text = element_text(size = 13),
  22. axis.title = element_text(face="bold"),
  23. axis.text.x=element_text(size = 12, angle = 45, hjust = 1.0, vjust = 1.0))
  24. plot

4. 套路四:以单基因高低表达分组,limma差异分析

  1. df <- exprs["Your Gene",]
  2. df <- as.data.frame(t(df))
  3. df$group <- ifelse(df$Your Gene > median(df$Your Gene), "High", "Low")
  4. table(df$group)
  5. library(limma)
  6. design.factor <- factor(df$group, levels=c('Low','High'))
  7. design.matrix <- model.matrix(~0+design.factor)
  8. colnames(design.matrix) <- levels(design.factor)
  9. design.matrix
  10. fit <- lmFit(exprs, design.matrix)
  11. cont.matrix <- makeContrasts(High-Low, levels=design.matrix)
  12. fit2 <- contrasts.fit(fit, cont.matrix)
  13. fit2 <- eBayes(fit2)
  14. DEG <- topTable(fit2,adjust="fdr",sort.by="B",number=Inf)
  15. head(DEG)
  16. DEG_filter <-subset(DEG, abs(logFC)>=1 & adj.P.Val<0.05)
  17. dim(DEG_filter)

5. 套路五:GO、KEGG差异分析

  1. library(DOSE)
  2. library(GO.db)
  3. library(org.Hs.eg.db)
  4. library(GSEABase)
  5. library(clusterProfiler)
  6. ## symbol ID transform to entre id
  7. symbol=as.character(rownames(DEG_filter))
  8. eg = bitr(symbol, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
  9. id = as.character(eg[,2])
  10. head(id)
  11. ## MF
  12. egomf <- enrichGO(gene = id,
  13. OrgDb = org.Hs.eg.db,
  14. ont = "MF",
  15. pAdjustMethod = "BH",
  16. pvalueCutoff = 0.05,
  17. qvalueCutoff = 0.05,
  18. readable = TRUE)
  19. barplot(egomf, showCategory=15)
  20. dotplot(egomf)
  21. emapplot(egomf)
  22. write.csv(egomf@result,"egomf.csv")
  23. ## CC
  24. egocc <- enrichGO(gene = id,
  25. OrgDb = org.Hs.eg.db,
  26. ont = "CC",
  27. pAdjustMethod = "BH",
  28. pvalueCutoff = 0.05,
  29. qvalueCutoff = 0.05,
  30. readable = TRUE)
  31. barplot(egocc, showCategory=15)
  32. dotplot(egocc)
  33. write.csv(egocc@result,"egocc.csv")
  34. ## BP
  35. egobp <- enrichGO(gene = id,
  36. OrgDb = org.Hs.eg.db,
  37. ont = "BP",
  38. pAdjustMethod = "BH",
  39. pvalueCutoff = 0.05,
  40. qvalueCutoff = 0.05,
  41. readable = TRUE)
  42. barplot(egobp, showCategory=15)
  43. dotplot(egobp)
  44. emapplot(egobp)
  45. write.csv(egobp@result,"egobp.csv")
  46. kk <- enrichKEGG(gene = id,
  47. organism = 'hsa',
  48. pvalueCutoff = 0.05,
  49. pAdjustMethod = "BH",
  50. qvalueCutoff = 0.05)
  51. barplot(kk, showCategory=15)
  52. dotplot(kk)
  53. kk <- setReadable(kk, OrgDb = org.Hs.eg.db, keyType="ENTREZID")
  54. write.csv(kk@result,"kegg.csv")
  55. browseKEGG(kk, 'hsa04972')

6. 套路六:GSEA

6.1 对limma差异分析的结果,进行整理

  1. symbol=as.character(rownames(DEG_filter))
  2. eg = bitr(symbol, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
  3. id = as.character(eg[,2])
  4. head(id)
  5. is.data.frame(eg)
  6. gene <- dplyr::distinct(eg, SYMBOL,.keep_all=TRUE)
  7. DEG_filter$SYMBOL <- rownames(DEG_filter)
  8. data_all_sort <- DEG_filter %>%
  9. dplyr::inner_join(gene, by="SYMBOL") %>%
  10. arrange(desc(logFC))
  11. head(data_all_sort)
  12. geneList = data_all_sort$logFC #把foldchange按照从大到小提取出来
  13. names(geneList) <- data_all_sort$ENTREZID #给上面提取的foldchange加上对应上ENTREZID
  14. head(geneList)
  15. ge = DEG_filter$logFC
  16. names(ge) = rownames(DEG_filter)
  17. ge = sort(ge,decreasing = T)
  18. head(ge)

6.2 GSEA

  1. library(msigdbr)
  2. GO_df = msigdbr(species = "Homo sapiens",category = "C5") %>%
  3. dplyr::select(gene_symbol,gs_exact_source,gs_subcat)
  4. dim(GO_df)
  5. length(unique(KEGG_df$gs_exact_source)) # 通路数量
  6. # GSEA
  7. library(GSVA)
  8. em <- GSEA(ge, TERM2GENE = GO_df)
  9. gseaplot2(em, geneSetID = 1, title = em$Description[1])

7. 套路七:OncoScore文本挖掘,评分

已发表

8. 套路八:与重要基因的相关性分析(如免疫检查点基因)

已发表

9. 套路九:与单细胞测序数据结合

年后分享!

10. 套路十:oncomine,TIME,GEPIA等在线网站的使用

前九件套小编主要用于非TCGA数据的分析。