dir <- c("import", "outport", "Rscript", "Rdata", "figures")for (i in 1:length(dir)){ if(!dir.exists(dir[i])) dir.create(dir[i])}get_disease_info = function(diseasename){ disease <- diseasename if(!dir.exists(paste0("import/", disease))) dir.create(paste0("import/", disease)) if(!dir.exists(paste0("import/", disease, "/clinical"))) dir.create(paste0("import/", disease, "/clinical")) if(!dir.exists(paste0("import/", disease, "/expdata"))) dir.create(paste0("import/", disease, "/expdata")) # 下载临床信息和表达矩阵 system(paste0("./gdc-client.exe download -m ./import/",disease,"/gdc_manifest_cl.txt -d ./import/",disease,"/clinical/")) system(paste0("./gdc-client.exe download -m ./import/",disease,"/gdc_manifest_exp.txt -d ./import/",disease,"/expdata/")) dir_cl <- paste0("import/", disease, "/clinical/") dir_exp <- paste0("import/", disease, "/expdata/") ## 临床信息整理 library(XML) xmls <- dir(dir_cl, pattern = "*.xml$", recursive = T) cl <- list() for(i in 1:length(xmls)){ result <- xmlParse(paste0(dir_cl,xmls[[i]])) rootnode <- xmlRoot(result) cl[[i]] <- xmlToDataFrame(rootnode[2]) } clinical <- do.call(rbind, cl) if(!dir.exists(paste0("Rdata/", disease))) dir.create(paste0("Rdata/", disease)) save(clinical, file = paste0("Rdata/", disease, "/clinical.Rdata")) ## 表达矩阵整理 counts <- dir(dir_exp, pattern = "*.htseq.counts.gz$", recursive = T) expdata <- list() for(i in 1:length(counts)){ expdata[[i]] <- read.table(paste0(dir_exp,counts[[i]]),row.names = 1,sep = "\t") } expdata <- do.call(cbind,expdata) # 添加列名 meta <- jsonlite::fromJSON(paste0("import/", disease, "/metadata.json")) ids <- meta$associated_entities ID <- sapply(ids,function(x){x[,1]}) file2id <- data.frame(file_name = meta$file_name, ID = ID) counts <- stringr::str_split(counts,"/",simplify = T)[,2] file2id <- file2id[match(counts, file2id$file_name),] colnames(expdata) <- file2id$ID # 将行名换成gene symbol # TCGA有自己的id转换方法 library(rtracklayer) gtf <- import("import/gencode.v22.annotation.gtf") gtf <- as.data.frame(gtf) gtf_gene <- gtf[gtf$type=="gene",] an <- gtf_gene[,c("gene_name","gene_id","gene_type")] expdata <- expdata[rownames(expdata) %in% an$gene_id,] an <- an[match(rownames(expdata),an$gene_id),] k <- !duplicated(an$gene_name);table(k) an <- an[k,] expdata <- expdata[k,] rownames(expdata) <- an$gene_name Group <- ifelse(stringr::str_sub(colnames(expdata),14 ,15)<10, "tumor", "normal") # 过滤低表达基因(玄学) number <- 0.5*ncol(expdata) expdata <- expdata[apply(expdata, 1, function(x) sum(x > 10) >number), ] save(expdata, Group, file = paste0("Rdata/", disease, "/expdata.Rdata"))}get_disease_info('ACC')