TCGA下载数据,并注释

    1. #下载数据
    2. proj = "TCGA-HNSC"
    3. download.file(url = paste0("https://gdc.xenahubs.net/download/",proj, ".htseq_counts.tsv.gz"),destfile = paste0(proj,".htseq_counts.tsv.gz"))
    4. #表达矩阵行名ID转换
    5. dat = read.table(paste0(proj,".htseq_counts.tsv.gz"),check.names = F,row.names = 1,header = T)
    6. ##逆转log
    7. dat = as.matrix(2^dat - 1)
    8. dat[1:4,1:4]
    9. ### 深坑一个
    10. dat[97,9]
    11. as.character(dat[97,9])
    12. ### 用apply转换为整数矩阵
    13. exp = apply(dat, 2, as.integer)
    14. exp[1:4,1:4] #行名消失
    15. rownames(exp) = rownames(dat)
    16. exp[1:4,1:4]
    17. #表达矩阵过滤
    18. nrow(exp)
    19. ##常用过滤标准1:
    20. ###仅去除在所有样本里表达量都为零的基因
    21. exp1 = exp[rowSums(exp)>0,]
    22. nrow(exp1)
    23. ##常用过滤标准2(推荐):
    24. ###仅保留在一半以上样本里表达的基因
    25. exp = exp[apply(exp, 1, function(x) sum(x > 0) > 0.5*ncol(exp)), ]
    26. nrow(exp)
    27. #分组信息获取
    28. ##根据样本ID的第14-15位,给样本分组(tumor和normal)
    29. library(dplyr)
    30. library(tidyverse)
    31. table(str_sub(colnames(exp),14,15))
    32. ###分组
    33. Group = ifelse(as.numeric(str_sub(colnames(exp),14,15)) < 10,'tumor','normal')
    34. Group = factor(Group,levels = c("normal","tumor"))
    35. table(Group)
    36. save(exp,Group,proj,file = paste0(proj,".Rdata"))

    标准化数据

    1. rm(list = ls())
    2. load("TCGA-HNSC.Rdata")
    3. table(Group)
    4. #deseq2----
    5. library(DESeq2)
    6. colData <- data.frame(row.names =colnames(exp),
    7. condition=Group)
    8. if(!file.exists(paste0(proj,"_dd.Rdata"))){
    9. dds <- DESeqDataSetFromMatrix(
    10. countData = exp,
    11. colData = colData,
    12. design = ~ condition)
    13. dds <- DESeq(dds)
    14. save(dds,file = paste0(proj,"_dd.Rdata"))
    15. }
    16. load(file = paste0(proj,"_dd.Rdata"))
    17. class(dds)
    18. res <- results(dds, contrast = c("condition",rev(levels(Group))))
    19. c("condition",rev(levels(Group)))
    20. class(res)
    21. DEG1 <- as.data.frame(res)
    22. DEG1 <- DEG1[order(DEG1$pvalue),]
    23. DEG1 = na.omit(DEG1)
    24. head(DEG1)
    25. #添加change列标记基因上调下调
    26. logFC_t = 2
    27. pvalue_t = 0.05
    28. k1 = (DEG1$pvalue < pvalue_t)&(DEG1$log2FoldChange < -logFC_t);table(k1)
    29. k2 = (DEG1$pvalue < pvalue_t)&(DEG1$log2FoldChange > logFC_t);table(k2)
    30. DEG1$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
    31. table(DEG1$change)
    32. head(DEG1)
    33. #edgeR----
    34. library(edgeR)
    35. dge <- DGEList(counts=exp,group=Group)
    36. dge$samples$lib.size <- colSums(dge$counts)
    37. dge <- calcNormFactors(dge)
    38. design <- model.matrix(~Group)
    39. dge <- estimateGLMCommonDisp(dge, design)
    40. dge <- estimateGLMTrendedDisp(dge, design)
    41. dge <- estimateGLMTagwiseDisp(dge, design)
    42. fit <- glmFit(dge, design)
    43. fit <- glmLRT(fit)
    44. DEG2=topTags(fit, n=Inf)
    45. class(DEG2)
    46. DEG2=as.data.frame(DEG2)
    47. head(DEG2)
    48. k1 = (DEG2$PValue < pvalue_t)&(DEG2$logFC < -logFC_t);table(k1)
    49. k2 = (DEG2$PValue < pvalue_t)&(DEG2$logFC > logFC_t);table(k2)
    50. DEG2$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
    51. head(DEG2)
    52. table(DEG2$change)
    53. ###limma----
    54. library(limma)
    55. dge <- DGEList(counts=exp)
    56. dge <- calcNormFactors(dge)
    57. v <- voom(dge,design, normalize="quantile")
    58. design <- model.matrix(~Group)
    59. fit <- lmFit(v, design)
    60. fit= eBayes(fit)
    61. DEG3 = topTable(fit, coef=2, n=Inf)
    62. DEG3 = na.omit(DEG3)
    63. k1 = (DEG3$P.Value < pvalue_t)&(DEG3$logFC < -logFC_t);table(k1)
    64. k2 = (DEG3$P.Value < pvalue_t)&(DEG3$logFC > logFC_t);table(k2)
    65. DEG3$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
    66. table(DEG3$change)
    67. head(DEG3)
    68. tj = data.frame(deseq2 = as.integer(table(DEG1$change)),
    69. edgeR = as.integer(table(DEG2$change)),
    70. limma_voom = as.integer(table(DEG3$change)),
    71. row.names = c("down","not","up")
    72. );tj
    73. save(DEG1,DEG2,DEG3,Group,tj,file = paste0(proj,"_DEG.Rdata"))

    PCA图

    1. rm(list = ls())
    2. library(ggplot2)
    3. library(tinyarray)
    4. load("TCGA-HNSC.Rdata")
    5. load("TCGA-HNSC_DEG.Rdata")
    6. exp[1:4,1:4]
    7. ncol(exp)
    8. # cpm 去除文库大小的影响
    9. dat = log2(cpm(exp)+1)
    10. pca.plot = draw_pca(dat,Group);pca.plot
    11. save(pca.plot,file = paste0(proj,"_pcaplot.Rdata"))

    TCGA作图,拼图

    1. install.packages('VennDiagram')
    2. library('VennDiagram')
    3. UP=function(df){
    4. rownames(df)[df$change=="UP"]
    5. }
    6. DOWN=function(df){
    7. rownames(df)[df$change=="DOWN"]
    8. }
    9. up = intersect(intersect(UP(DEG1),UP(DEG2)),UP(DEG3))
    10. down = intersect(intersect(DOWN(DEG1),DOWN(DEG2)),DOWN(DEG3))
    11. dat = log2(cpm(exp)+1)
    12. hp = draw_heatmap(dat[c(up,down),],Group,n_cutoff = 2)
    13. #上调、下调基因分别画维恩图
    14. up_genes = list(Deseq2 = UP(DEG1),
    15. edgeR = UP(DEG2),
    16. limma = UP(DEG3))
    17. down_genes = list(Deseq2 = DOWN(DEG1),
    18. edgeR = DOWN(DEG2),
    19. limma = DOWN(DEG3))
    20. up.plot <- draw_venn(up_genes,"UPgene")
    21. down.plot <- draw_venn(down_genes,"DOWNgene")
    22. #维恩图拼图,终于搞定
    23. library(patchwork)
    24. #up.plot + down.plot
    25. # 拼图
    26. pca.plot + hp+up.plot +down.plot+ plot_layout(guides = "collect")+
    27. plot_annotation(title = "HNSC",
    28. tag_levels = "A")
    29. ggsave(paste0(proj,"_heat_ve_pca.png"),width = 15,height = 10)

    image.png