GTF文件
下载
用于将表达矩阵分为「CDs、NCDs」。
Ensembl http://asia.ensembl.org/info/data/ftp/index.html
→ ftp://ftp.ensembl.org/pub/release-99/gtf/homo_sapiens
比较神奇的是,chrome居然打不开这个网址(无论怎么清理cookie/换ip)
读入R
直接以” tabular format”读入。
在数据来源和分类里迷惑了半天,后来发现似乎并不需要考虑那么多…
mygtf <- data.table::fread("Homo_sapiens.GRCh38.99.chr.gtf/Homo_sapiens.GRCh38.99.chr.gtf", sep = "\t")
dim(mygtf)
# [1] 2904340 9
class(mygtf)
# [1] "data.table" "data.frame"
dim(mygtf)
# [1] 2904340 9
table(mygtf$V2)
#
# ensembl ensembl_havana ensembl_havana_tagene
# 235548 690227 95
# havana havana_tagene insdc
# 1862976 109820 37
# mirbase
# 5637
table(mygtf$V3)
#
# CDS exon five_prime_utr gene
# 763375 1376244 153149 60617
# Selenocysteine start_codon stop_codon three_prime_utr
# 119 87814 80153 155118
# transcript
# 227751
miRBase is the central online repository for microRNA (miRNA) nomenclature, sequence data, annotation and target prediction. Griffiths-Jones S, Saini H K, van Dongen S, et al. miRBase: tools for microRNA genomics[J]. Nucleic acids research, 2007, 36(suppl_1): D154-D158
mygtf$ensid <- substr(mygtf$V9,10,24)
table(duplicated(mygtf$ensid))
mygtf <- mygtf[!duplicated(mygtf$ensid),]
dim(mygtf)
# [1] 60617 10
table(mygtf$V3)
#
# gene
# 60617
row.names(mygtf) <- mygtf$ensid
genename <- strsplit(mygtf$V9,split = ";")
start <- Sys.time()
for(i in 1:nrow(mygtf)) {
mygtf$genename[i] <- strsplit(genename[[i]][3],split = " ")[[1]][3]
}
end <- Sys.time()
end - start
# Time difference of 12.65077 mins
mygtf$genename <- gsub('\"',"", mygtf$genename, fixed = TRUE)
start <- Sys.time()
for(i in 1:nrow(mygtf)) {
mygtf$genetype[i] <- strsplit(genename[[i]][5],split = " ")[[1]][3]
}
end <- Sys.time()
end - start
# Time difference of 8.673628 mins
mygtf$genetype <- gsub('\"',"", mygtf$genetype, fixed = TRUE)
将genetype分为两类,genetype含义参考:https://www.gencodegenes.org/pages/biotypes.html
mygtf$group <- ifelse(mygtf$genetype=="IG_C_gene"|mygtf$genetype=="IG_D_gene"|
mygtf$genetype=="IG_J_gene"|mygtf$genetype=="IG_V_gene"|
mygtf$genetype=="protein_coding"|mygtf$genetype=="TEC"|
mygtf$genetype=="TR_C_gene"|mygtf$genetype=="TR_D_gene"|
mygtf$genetype=="TR_J_gene"|mygtf$genetype=="TR_V_gene",
"CDS","other")
table(mygtf$group)
#
# CDS other
# 21407 39210
cdsid <- mygtf[mygtf$group=="CDS",]$ensid
otherid <- mygtf[mygtf$group=="other",]$ensid
save(cdsid,file = "cdsid.Rdata")
save(otherid,file = "otherid.Rdata")
save(mygtf,file = "humangtf.Rdata")
将TCGA表达矩阵分为两个矩阵存为Rdata
选用 GDC-TCGA 33种癌症 https://xenabrowser.net/datapages/
path <- "../13genes-signature/GDC-TCGA-data"
filenames <- list.files(path,pattern = ".tsv")
filePath <- sapply(filenames, function(x){
paste(path,x,x,sep='/')})
filePath <- filePath[c(TRUE, FALSE)]
start <- Sys.time()
expdata <- lapply(filePath, function(x){
expr <- read.table(x,header = T,sep='\t')
expr <- expr[1:(nrow(expr)-5),]
expr[,1] <- as.character(expr[,1])
for (j in 1:nrow(expr)) {
expr[j,1] <- strsplit(expr[j,1],"\\.")[[1]][1]
}
row.names(expr) <- expr[,1]
expr <- expr[,-1]
expr <- log2(edgeR::cpm(expr)+1)
expr <- na.omit(expr)
})
end <- Sys.time()
end - start
# Time difference of 37.246043 mins
start <- Sys.time()
for(i in 1:length(expdata)){
expr <- expdata[[i]]
cds_expr <- expr[intersect(row.names(expr),cdsid),]
other_expr <- expr[intersect(row.names(expr),otherid),]
save(cds_expr,file = paste0(sub("-","",stringr::str_extract(string = names(filePath)[i],
pattern = '-\\w*')),"_cds_expr.Rdata"))
save(other_expr,file = paste0(sub("-","",stringr::str_extract(string = names(filePath)[i],
pattern = '-\\w*')),"_other_expr.Rdata"))
}
end <- Sys.time()
end - start
# Time difference of 3.861098 mins
得到了66个拆分后的表达矩阵.Rdata
生存数据存为Rdata
path <- "../13genes-signature/GDC-TCGA-data"
filenames <- list.files(path,pattern = ".tsv")
filePath <- sapply(filenames, function(x){
paste(path,x,x,sep='/')})
filePath <- filePath[c(FALSE, TRUE)]
start <- Sys.time()
survdata <- lapply(filePath, function(x){
phe <- read.table(x,header = T,sep='\t',
fill = T,quote = "")
phe$sample = gsub('-','.',phe$sample)
row.names(phe) <- phe[,1]
return(phe)
# id <- intersect(colnames(expr),rownames(phe))
# expr <- expr[,id]
# phe <- phe[id,]
})
end <- Sys.time()
end - start
# Time difference of 0.5615041 secs
start <- Sys.time()
for(i in 1:length(survdata)){
surv <- survdata[[i]]
save(surv,file = paste0(sub("-","",stringr::str_extract(string = names(filePath)[i],
pattern = '-\\w*')),"_surv.Rdata"))
}
end <- Sys.time()
end - start
# Time difference of 0.2563121 secs