1. dir <- c("import", "outport", "Rscript", "Rdata", "figures")
    2. for (i in 1:length(dir)){
    3. if(!dir.exists(dir[i])) dir.create(dir[i])
    4. }
    5. get_disease_info = function(diseasename){
    6. disease <- diseasename
    7. if(!dir.exists(paste0("import/", disease)))
    8. dir.create(paste0("import/", disease))
    9. if(!dir.exists(paste0("import/", disease, "/clinical")))
    10. dir.create(paste0("import/", disease, "/clinical"))
    11. if(!dir.exists(paste0("import/", disease, "/expdata")))
    12. dir.create(paste0("import/", disease, "/expdata"))
    13. # 下载临床信息和表达矩阵
    14. system(paste0("./gdc-client.exe download -m ./import/",disease,"/gdc_manifest_cl.txt -d ./import/",disease,"/clinical/"))
    15. system(paste0("./gdc-client.exe download -m ./import/",disease,"/gdc_manifest_exp.txt -d ./import/",disease,"/expdata/"))
    16. dir_cl <- paste0("import/", disease, "/clinical/")
    17. dir_exp <- paste0("import/", disease, "/expdata/")
    18. ## 临床信息整理
    19. library(XML)
    20. xmls <- dir(dir_cl, pattern = "*.xml$", recursive = T)
    21. cl <- list()
    22. for(i in 1:length(xmls)){
    23. result <- xmlParse(paste0(dir_cl,xmls[[i]]))
    24. rootnode <- xmlRoot(result)
    25. cl[[i]] <- xmlToDataFrame(rootnode[2])
    26. }
    27. clinical <- do.call(rbind, cl)
    28. if(!dir.exists(paste0("Rdata/", disease)))
    29. dir.create(paste0("Rdata/", disease))
    30. save(clinical, file = paste0("Rdata/", disease, "/clinical.Rdata"))
    31. ## 表达矩阵整理
    32. counts <- dir(dir_exp, pattern = "*.htseq.counts.gz$", recursive = T)
    33. expdata <- list()
    34. for(i in 1:length(counts)){
    35. expdata[[i]] <- read.table(paste0(dir_exp,counts[[i]]),row.names = 1,sep = "\t")
    36. }
    37. expdata <- do.call(cbind,expdata)
    38. # 添加列名
    39. meta <- jsonlite::fromJSON(paste0("import/", disease, "/metadata.json"))
    40. ids <- meta$associated_entities
    41. ID <- sapply(ids,function(x){x[,1]})
    42. file2id <- data.frame(file_name = meta$file_name,
    43. ID = ID)
    44. counts <- stringr::str_split(counts,"/",simplify = T)[,2]
    45. file2id <- file2id[match(counts, file2id$file_name),]
    46. colnames(expdata) <- file2id$ID
    47. # 将行名换成gene symbol
    48. # TCGA有自己的id转换方法
    49. library(rtracklayer)
    50. gtf <- import("import/gencode.v22.annotation.gtf")
    51. gtf <- as.data.frame(gtf)
    52. gtf_gene <- gtf[gtf$type=="gene",]
    53. an <- gtf_gene[,c("gene_name","gene_id","gene_type")]
    54. expdata <- expdata[rownames(expdata) %in% an$gene_id,]
    55. an <- an[match(rownames(expdata),an$gene_id),]
    56. k <- !duplicated(an$gene_name);table(k)
    57. an <- an[k,]
    58. expdata <- expdata[k,]
    59. rownames(expdata) <- an$gene_name
    60. Group <- ifelse(stringr::str_sub(colnames(expdata),14 ,15)<10, "tumor", "normal")
    61. # 过滤低表达基因(玄学)
    62. number <- 0.5*ncol(expdata)
    63. expdata <- expdata[apply(expdata, 1, function(x) sum(x > 10) >number), ]
    64. save(expdata, Group, file = paste0("Rdata/", disease, "/expdata.Rdata"))
    65. }
    66. get_disease_info('ACC')