dir <- c("import", "outport", "Rscript", "Rdata", "figures")
for (i in 1:length(dir)){
if(!dir.exists(dir[i])) dir.create(dir[i])
}
get_disease_info = function(diseasename){
disease <- diseasename
if(!dir.exists(paste0("import/", disease)))
dir.create(paste0("import/", disease))
if(!dir.exists(paste0("import/", disease, "/clinical")))
dir.create(paste0("import/", disease, "/clinical"))
if(!dir.exists(paste0("import/", disease, "/expdata")))
dir.create(paste0("import/", disease, "/expdata"))
# 下载临床信息和表达矩阵
system(paste0("./gdc-client.exe download -m ./import/",disease,"/gdc_manifest_cl.txt -d ./import/",disease,"/clinical/"))
system(paste0("./gdc-client.exe download -m ./import/",disease,"/gdc_manifest_exp.txt -d ./import/",disease,"/expdata/"))
dir_cl <- paste0("import/", disease, "/clinical/")
dir_exp <- paste0("import/", disease, "/expdata/")
## 临床信息整理
library(XML)
xmls <- dir(dir_cl, pattern = "*.xml$", recursive = T)
cl <- list()
for(i in 1:length(xmls)){
result <- xmlParse(paste0(dir_cl,xmls[[i]]))
rootnode <- xmlRoot(result)
cl[[i]] <- xmlToDataFrame(rootnode[2])
}
clinical <- do.call(rbind, cl)
if(!dir.exists(paste0("Rdata/", disease)))
dir.create(paste0("Rdata/", disease))
save(clinical, file = paste0("Rdata/", disease, "/clinical.Rdata"))
## 表达矩阵整理
counts <- dir(dir_exp, pattern = "*.htseq.counts.gz$", recursive = T)
expdata <- list()
for(i in 1:length(counts)){
expdata[[i]] <- read.table(paste0(dir_exp,counts[[i]]),row.names = 1,sep = "\t")
}
expdata <- do.call(cbind,expdata)
# 添加列名
meta <- jsonlite::fromJSON(paste0("import/", disease, "/metadata.json"))
ids <- meta$associated_entities
ID <- sapply(ids,function(x){x[,1]})
file2id <- data.frame(file_name = meta$file_name,
ID = ID)
counts <- stringr::str_split(counts,"/",simplify = T)[,2]
file2id <- file2id[match(counts, file2id$file_name),]
colnames(expdata) <- file2id$ID
# 将行名换成gene symbol
# TCGA有自己的id转换方法
library(rtracklayer)
gtf <- import("import/gencode.v22.annotation.gtf")
gtf <- as.data.frame(gtf)
gtf_gene <- gtf[gtf$type=="gene",]
an <- gtf_gene[,c("gene_name","gene_id","gene_type")]
expdata <- expdata[rownames(expdata) %in% an$gene_id,]
an <- an[match(rownames(expdata),an$gene_id),]
k <- !duplicated(an$gene_name);table(k)
an <- an[k,]
expdata <- expdata[k,]
rownames(expdata) <- an$gene_name
Group <- ifelse(stringr::str_sub(colnames(expdata),14 ,15)<10, "tumor", "normal")
# 过滤低表达基因(玄学)
number <- 0.5*ncol(expdata)
expdata <- expdata[apply(expdata, 1, function(x) sum(x > 10) >number), ]
save(expdata, Group, file = paste0("Rdata/", disease, "/expdata.Rdata"))
}
get_disease_info('ACC')