获取更多R语言知识,请关注公众号:医学和生信笔记

    医学和生信笔记 公众号主要分享:1.医学小知识、肛肠科小知识;2.R语言和Python相关的数据分析、可视化、机器学习等;3.生物信息学学习资料和自己的学习笔记!

    从TCGA下载的counts,FPKM和miRNA是很多单个的文件,需要手动合成一个完整的表达矩阵,这是一个高频操作,因此写一段代码简化流程,部分代码来自生信技能树和生信星球。

    把以下代码保存为“get_TCGA_expr.R”,下次直接 source(get_TCGA_expr.R),就可以使用定义好的3个函数。
    get_TCGAcounts
    get_TCGAfpkm
    get_TCGAmirna

    1. ## 需要提供counts文件所在的文件夹目录和metadata路径
    2. ## 类似于 countsfiles <- "E:/TCGA_COADREAD/mirna/"
    3. ## metadata <- "E:/TCGA_COADREAD/mirna/metadata.cart.2021-05-31.json"
    4. # 获取counts表达矩阵,
    5. get_TCGAcounts <- function(rnaseqpath, metadatapath){
    6. allfiles <- list.files(path = rnaseqpath ,pattern='*.counts.gz$',recursive=T)
    7. mRNA_seq <- lapply(allfiles, function(x){
    8. result <- data.table::fread(file = file.path(rnaseqpath,x),data.table = F)
    9. return( result )
    10. })
    11. mRNA_df <- Reduce(cbind, mRNA_seq)
    12. tem <- nrow(mRNA_df)-5
    13. mRNA_df <- mRNA_df[1:tem, c(1,seq(2,ncol(mRNA_df),2))]
    14. save(mRNA_df,file = './TCGA_RNA.Rdata')
    15. #读取metadata信息
    16. meta <- jsonlite::fromJSON(metadatapath)
    17. entity <- meta$associated_entities
    18. tcga_id <- unlist(lapply(entity,function(x){x["entity_submitter_id"]}))
    19. file_name <- meta$file_name
    20. fln2id <- data.frame(filename = file_name, tcgaid = tcga_id)
    21. #把样本名(tcga_patient_id)变成表达矩阵的列名
    22. mis2 <- stringr::str_split(allfiles,"/",simplify = T)[,2]
    23. col_tcga <- fln2id[match(mis2,fln2id$filename),]
    24. colnames(mRNA_df)[2:ncol(mRNA_df)] <- as.character(col_tcga$tcgaid)
    25. rnaExpr <<- mRNA_df
    26. save(rnaExpr,file = "./TCGA_new_RNA.Rdata")
    27. print(paste("your RNA exprset has been saved at: ", getwd(), sep = ""))
    28. }
    29. # 获取FPKM表达矩阵
    30. get_TCGAfpkm <- function(rnaseqpath, metadatapath){
    31. allfiles <- list.files(path = rnaseqpath ,pattern='*.FPKM.txt.gz$',recursive=T)
    32. mRNA_seq <- lapply(allfiles, function(x){
    33. result <- data.table::fread(file = file.path(rnaseqpath,x),data.table = F)
    34. return( result )
    35. })
    36. mRNA_df <- Reduce(cbind, mRNA_seq)
    37. tem <- nrow(mRNA_df)-5
    38. mRNA_df <- mRNA_df[1:tem, c(1,seq(2,ncol(mRNA_df),2))]
    39. save(mRNA_df,file = './TCGA_RNA.Rdata')
    40. #读取metadata信息
    41. meta <- jsonlite::fromJSON(metadatapath)
    42. entity <- meta$associated_entities
    43. tcga_id <- unlist(lapply(entity,function(x){x["entity_submitter_id"]}))
    44. file_name <- meta$file_name
    45. fln2id <- data.frame(filename = file_name, tcgaid = tcga_id)
    46. #把样本名(tcga_patient_id)变成表达矩阵的列名
    47. mis2 <- stringr::str_split(allfiles,"/",simplify = T)[,2]
    48. col_tcga <- fln2id[match(mis2,fln2id$filename),]
    49. colnames(mRNA_df)[2:ncol(mRNA_df)] <- as.character(col_tcga$tcgaid)
    50. rnaExpr <<- mRNA_df
    51. save(rnaExpr,file = "./TCGA_new_RNA.Rdata")
    52. print(paste("your RNA exprset has been saved at: ", getwd(), sep = ""))
    53. }
    54. # 获取mirna表达矩阵
    55. get_TCGAmirna <- function(rnaseqpath, metadatapath){
    56. allfiles <- list.files(path = rnaseqpath ,pattern='*.mirnas.quantification.txt$',recursive=T)
    57. mRNA_seq <- lapply(allfiles, function(x){
    58. result <- data.table::fread(file = file.path(rnaseqpath,x),data.table = F)
    59. return( result )
    60. })
    61. mRNA_df <- Reduce(cbind, mRNA_seq)
    62. mRNA_df <- mRNA_df[, c(1,seq(2,ncol(mRNA_df),4))]
    63. save(mRNA_df,file = './TCGA_miRNA.Rdata')
    64. #读取metadata信息
    65. meta <- jsonlite::fromJSON(metadatapath)
    66. entity <- meta$associated_entities
    67. tcga_id <- unlist(lapply(entity,function(x){x["entity_submitter_id"]}))
    68. file_name <- meta$file_name
    69. fln2id <- data.frame(filename = file_name, tcgaid = tcga_id)
    70. #把样本名(tcga_patient_id)变成表达矩阵的列名
    71. mis2 <- stringr::str_split(allfiles,"/",simplify = T)[,2]
    72. col_tcga <- fln2id[match(mis2,fln2id$filename),]
    73. colnames(mRNA_df)[2:ncol(mRNA_df)] <- as.character(col_tcga$tcgaid)
    74. rnaExpr <<- mRNA_df
    75. save(rnaExpr,file = "./TCGA_miRNAexpr.Rdata")
    76. print(paste("your RNA exprset has been saved at: ", getwd(), sep = ""))

    获取更多R语言知识,请关注公众号:医学和生信笔记

    医学和生信笔记 公众号主要分享:1.医学小知识、肛肠科小知识;2.R语言和Python相关的数据分析、可视化、机器学习等;3.生物信息学学习资料和自己的学习笔记!