语雀:左手柳叶刀右手炭火烧
微信公众号:研平方 | 简书:研平方
关注可了解更多的科研教程及技巧。如有问题或建议,请留言。
欢迎关注我:一起学习,一起进步!
1. 套路一:观察重要表型与感兴趣基因之间的关系。
比如观察KRAS有无突变两组之间,感兴趣基因的表达是否有差异。
1.1 整理数据:提取感兴趣基因和临床表型数据
expr <- exprs[exprs$symbol %in% c("Your Interesting Gene Symbol"),]rownames(expr) <- expr$symbolexpr <- expr[,-1]expr <- as.data.frame(t(expr))str(expr)expr$Source.Name <- rownames(expr)comData <- merge(expr,pheno,by="Source.Name") #合并表达和表型数据
1.2 以表型为分组,观察感兴趣基因之间是否具有差异表达。
ibrary(reshape2)library(gridExtra)library(ggthemes)library(ggplot2)library(ggpubr)library(ggsignif)table(comData$Characteristics.krasmut.)df <- comData[comData$Characteristics.krasmut. !="not available",]table(df$Characteristics.krasmut.)plot <- ggplot(df,aes(x=Characteristics.krasmut., y="Your Gene", fill=Characteristics.krasmut.))+geom_boxplot(aes(fill=Characteristics.krasmut.), outlier.shape = NA)+geom_jitter(size=0.1, alpha=0.2, position=position_jitter(0.3))+stat_compare_means(label = "p.signif", method = "t.test", label.x = 1.5, label.y.npc = "top", hide.ns = TRUE, col="red")+theme_pubclean() +scale_x_discrete(name="")+theme(plot.title = element_text(size = 13, face = "bold"),legend.position = "right",text = element_text(size = 13),axis.title = element_text(face="bold"),axis.text.x=element_text(size = 12, angle = 45, hjust = 1.0, vjust = 1.0))plot
2. 套路二:生存分析
library(survival)library(survminer)library(ggthemes)## remove the "not available" datadf <- comData[comData$Characteristics.os.delay. !="not available",]df$os <- as.numeric(df$Characteristics.os.delay.)df$event <- as.numeric(df$Characteristics.os.event.)survi_cutoff <- surv_cutpoint(data = df, time = "os", event = "event",variables = "Your gene")summary(survi_cutoff)sur_cut <- surv_categorize(survi_cutoff)head(sur_cut)sur_cut$os <- as.numeric(sur_cut$os)sur_cut$event <- as.numeric(sur_cut$event)fit <- survfit(Surv("os", "event") ~Your gene, data = sur_cut)ggsurvplot(fit, data = sur_cat, conf.int = T, # 95%CIsize = 0.6, # change line sizepval = T, # p-value of log-rank testrisk.table = TRUE,xlab=c("Overall survival(months)"),ggtheme = theme_gray(), # theme_minimal() theme_gray() theme_classic() theme_bw()legend.title="Your gene", legend.labs=c("High","Low") )
3. 套路三:ssgsva
3.1 make genelist and load expression data
## genelistrm(list = ls())library(GSVA)library(GSEABase)library(msigdbr)library(clusterProfiler)library(org.Hs.eg.db)library(enrichplot)library(limma)library(readxl)genelist <- read_excel("genelist.xlsx")genelist <- as.data.frame(genelist)pathway_list <- lapply(genelist, function(x) {unique(na.omit(x))})## expression dataload("expr.Rdata")row.names(expr) <- expr$symbolexprs <- dplyr::select(expr, -symbol)gsva <- gsva(as.matrix(exprs), pathway_list,method='ssgsea',kcdf='Gaussian',abs.ranking=TRUE)write.csv(gsva, file = "gsva.csv")## setting group according your interesting gene expressiondf <- exprs["Your Gene",]df <- as.data.frame(t(df))df$group <- ifelse(df$Your Gene > median(df$Your Gene), "High", "Low")table(df$group)gsva_matrix <- t(gsva)## Make sure the samples are in the correct orderdf <- df[rownames(gsva),]gsva_matrix <- cbind(gsva_matrix,df)
3.2 make analysis according the gsva result
library(reshape2)library(gridExtra)library(ggthemes)library(ggplot2)library(ggpubr)library(ggsignif)table(gsva_matrix$group)rt <- gsva_matrix[,c(1:29,31)]df <- melt(rt, id.vars = "group", variable.name = "immuneCells",value.name = "Expression")df$group <- factor(df$group, levels=c('High','Low'))plot <- ggplot(df,aes(x=immuneCells, y=Expression, fill=group))+geom_boxplot(aes(fill=group), outlier.shape = NA)+geom_jitter(size=0.1, alpha=0.2, position=position_jitter(0.3))+#geom_signif(comparisons = compaired, map_signif_level = T)+stat_compare_means(label = "p.signif", method = "t.test", label.x = 1.5, label.y.npc = "top", hide.ns = TRUE, col="red")+theme_pubclean() +scale_x_discrete(name="")+theme(plot.title = element_text(size = 13, face = "bold"),legend.position = "right",text = element_text(size = 13),axis.title = element_text(face="bold"),axis.text.x=element_text(size = 12, angle = 45, hjust = 1.0, vjust = 1.0))plot
4. 套路四:以单基因高低表达分组,limma差异分析
df <- exprs["Your Gene",]df <- as.data.frame(t(df))df$group <- ifelse(df$Your Gene > median(df$Your Gene), "High", "Low")table(df$group)library(limma)design.factor <- factor(df$group, levels=c('Low','High'))design.matrix <- model.matrix(~0+design.factor)colnames(design.matrix) <- levels(design.factor)design.matrixfit <- lmFit(exprs, design.matrix)cont.matrix <- makeContrasts(High-Low, levels=design.matrix)fit2 <- contrasts.fit(fit, cont.matrix)fit2 <- eBayes(fit2)DEG <- topTable(fit2,adjust="fdr",sort.by="B",number=Inf)head(DEG)DEG_filter <-subset(DEG, abs(logFC)>=1 & adj.P.Val<0.05)dim(DEG_filter)
5. 套路五:GO、KEGG差异分析
library(DOSE)library(GO.db)library(org.Hs.eg.db)library(GSEABase)library(clusterProfiler)## symbol ID transform to entre idsymbol=as.character(rownames(DEG_filter))eg = bitr(symbol, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")id = as.character(eg[,2])head(id)## MFegomf <- enrichGO(gene = id,OrgDb = org.Hs.eg.db,ont = "MF",pAdjustMethod = "BH",pvalueCutoff = 0.05,qvalueCutoff = 0.05,readable = TRUE)barplot(egomf, showCategory=15)dotplot(egomf)emapplot(egomf)write.csv(egomf@result,"egomf.csv")## CCegocc <- enrichGO(gene = id,OrgDb = org.Hs.eg.db,ont = "CC",pAdjustMethod = "BH",pvalueCutoff = 0.05,qvalueCutoff = 0.05,readable = TRUE)barplot(egocc, showCategory=15)dotplot(egocc)write.csv(egocc@result,"egocc.csv")## BPegobp <- enrichGO(gene = id,OrgDb = org.Hs.eg.db,ont = "BP",pAdjustMethod = "BH",pvalueCutoff = 0.05,qvalueCutoff = 0.05,readable = TRUE)barplot(egobp, showCategory=15)dotplot(egobp)emapplot(egobp)write.csv(egobp@result,"egobp.csv")kk <- enrichKEGG(gene = id,organism = 'hsa',pvalueCutoff = 0.05,pAdjustMethod = "BH",qvalueCutoff = 0.05)barplot(kk, showCategory=15)dotplot(kk)kk <- setReadable(kk, OrgDb = org.Hs.eg.db, keyType="ENTREZID")write.csv(kk@result,"kegg.csv")browseKEGG(kk, 'hsa04972')
6. 套路六:GSEA
6.1 对limma差异分析的结果,进行整理
symbol=as.character(rownames(DEG_filter))eg = bitr(symbol, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")id = as.character(eg[,2])head(id)is.data.frame(eg)gene <- dplyr::distinct(eg, SYMBOL,.keep_all=TRUE)DEG_filter$SYMBOL <- rownames(DEG_filter)data_all_sort <- DEG_filter %>%dplyr::inner_join(gene, by="SYMBOL") %>%arrange(desc(logFC))head(data_all_sort)geneList = data_all_sort$logFC #把foldchange按照从大到小提取出来names(geneList) <- data_all_sort$ENTREZID #给上面提取的foldchange加上对应上ENTREZIDhead(geneList)ge = DEG_filter$logFCnames(ge) = rownames(DEG_filter)ge = sort(ge,decreasing = T)head(ge)
6.2 GSEA
library(msigdbr)GO_df = msigdbr(species = "Homo sapiens",category = "C5") %>%dplyr::select(gene_symbol,gs_exact_source,gs_subcat)dim(GO_df)length(unique(KEGG_df$gs_exact_source)) # 通路数量# GSEAlibrary(GSVA)em <- GSEA(ge, TERM2GENE = GO_df)gseaplot2(em, geneSetID = 1, title = em$Description[1])
7. 套路七:OncoScore文本挖掘,评分
已发表
8. 套路八:与重要基因的相关性分析(如免疫检查点基因)
已发表
9. 套路九:与单细胞测序数据结合
年后分享!
10. 套路十:oncomine,TIME,GEPIA等在线网站的使用
前九件套小编主要用于非TCGA数据的分析。
