GTF文件

下载

用于将表达矩阵分为「CDs、NCDs」。

Ensembl http://asia.ensembl.org/info/data/ftp/index.html

ftp://ftp.ensembl.org/pub/release-99/gtf/homo_sapiens

GDC-TCGA 33种癌症Rdata准备 - 图1

比较神奇的是,chrome居然打不开这个网址(无论怎么清理cookie/换ip)

GDC-TCGA 33种癌症Rdata准备 - 图2

读入R

直接以” tabular format”读入。

在数据来源和分类里迷惑了半天,后来发现似乎并不需要考虑那么多…

  1. mygtf <- data.table::fread("Homo_sapiens.GRCh38.99.chr.gtf/Homo_sapiens.GRCh38.99.chr.gtf", sep = "\t")
  2. dim(mygtf)
  3. # [1] 2904340 9
  4. class(mygtf)
  5. # [1] "data.table" "data.frame"
  6. dim(mygtf)
  7. # [1] 2904340 9
  8. table(mygtf$V2)
  9. #
  10. # ensembl ensembl_havana ensembl_havana_tagene
  11. # 235548 690227 95
  12. # havana havana_tagene insdc
  13. # 1862976 109820 37
  14. # mirbase
  15. # 5637
  16. table(mygtf$V3)
  17. #
  18. # CDS exon five_prime_utr gene
  19. # 763375 1376244 153149 60617
  20. # Selenocysteine start_codon stop_codon three_prime_utr
  21. # 119 87814 80153 155118
  22. # transcript
  23. # 227751

miRBase is the central online repository for microRNA (miRNA) nomenclature, sequence data, annotation and target prediction. Griffiths-Jones S, Saini H K, van Dongen S, et al. miRBase: tools for microRNA genomics[J]. Nucleic acids research, 2007, 36(suppl_1): D154-D158

  1. mygtf$ensid <- substr(mygtf$V9,10,24)
  2. table(duplicated(mygtf$ensid))
  3. mygtf <- mygtf[!duplicated(mygtf$ensid),]
  4. dim(mygtf)
  5. # [1] 60617 10
  6. table(mygtf$V3)
  7. #
  8. # gene
  9. # 60617
  10. row.names(mygtf) <- mygtf$ensid
  11. genename <- strsplit(mygtf$V9,split = ";")
  12. start <- Sys.time()
  13. for(i in 1:nrow(mygtf)) {
  14. mygtf$genename[i] <- strsplit(genename[[i]][3],split = " ")[[1]][3]
  15. }
  16. end <- Sys.time()
  17. end - start
  18. # Time difference of 12.65077 mins
  19. mygtf$genename <- gsub('\"',"", mygtf$genename, fixed = TRUE)
  20. start <- Sys.time()
  21. for(i in 1:nrow(mygtf)) {
  22. mygtf$genetype[i] <- strsplit(genename[[i]][5],split = " ")[[1]][3]
  23. }
  24. end <- Sys.time()
  25. end - start
  26. # Time difference of 8.673628 mins
  27. mygtf$genetype <- gsub('\"',"", mygtf$genetype, fixed = TRUE)

将genetype分为两类,genetype含义参考:https://www.gencodegenes.org/pages/biotypes.html

  1. mygtf$group <- ifelse(mygtf$genetype=="IG_C_gene"|mygtf$genetype=="IG_D_gene"|
  2. mygtf$genetype=="IG_J_gene"|mygtf$genetype=="IG_V_gene"|
  3. mygtf$genetype=="protein_coding"|mygtf$genetype=="TEC"|
  4. mygtf$genetype=="TR_C_gene"|mygtf$genetype=="TR_D_gene"|
  5. mygtf$genetype=="TR_J_gene"|mygtf$genetype=="TR_V_gene",
  6. "CDS","other")
  7. table(mygtf$group)
  8. #
  9. # CDS other
  10. # 21407 39210
  11. cdsid <- mygtf[mygtf$group=="CDS",]$ensid
  12. otherid <- mygtf[mygtf$group=="other",]$ensid
  13. save(cdsid,file = "cdsid.Rdata")
  14. save(otherid,file = "otherid.Rdata")
  15. save(mygtf,file = "humangtf.Rdata")

将TCGA表达矩阵分为两个矩阵存为Rdata

选用 GDC-TCGA 33种癌症 https://xenabrowser.net/datapages/

  1. path <- "../13genes-signature/GDC-TCGA-data"
  2. filenames <- list.files(path,pattern = ".tsv")
  3. filePath <- sapply(filenames, function(x){
  4. paste(path,x,x,sep='/')})
  5. filePath <- filePath[c(TRUE, FALSE)]
  6. start <- Sys.time()
  7. expdata <- lapply(filePath, function(x){
  8. expr <- read.table(x,header = T,sep='\t')
  9. expr <- expr[1:(nrow(expr)-5),]
  10. expr[,1] <- as.character(expr[,1])
  11. for (j in 1:nrow(expr)) {
  12. expr[j,1] <- strsplit(expr[j,1],"\\.")[[1]][1]
  13. }
  14. row.names(expr) <- expr[,1]
  15. expr <- expr[,-1]
  16. expr <- log2(edgeR::cpm(expr)+1)
  17. expr <- na.omit(expr)
  18. })
  19. end <- Sys.time()
  20. end - start
  21. # Time difference of 37.246043 mins
  22. start <- Sys.time()
  23. for(i in 1:length(expdata)){
  24. expr <- expdata[[i]]
  25. cds_expr <- expr[intersect(row.names(expr),cdsid),]
  26. other_expr <- expr[intersect(row.names(expr),otherid),]
  27. save(cds_expr,file = paste0(sub("-","",stringr::str_extract(string = names(filePath)[i],
  28. pattern = '-\\w*')),"_cds_expr.Rdata"))
  29. save(other_expr,file = paste0(sub("-","",stringr::str_extract(string = names(filePath)[i],
  30. pattern = '-\\w*')),"_other_expr.Rdata"))
  31. }
  32. end <- Sys.time()
  33. end - start
  34. # Time difference of 3.861098 mins

得到了66个拆分后的表达矩阵.Rdata

GDC-TCGA 33种癌症Rdata准备 - 图3

生存数据存为Rdata

  1. path <- "../13genes-signature/GDC-TCGA-data"
  2. filenames <- list.files(path,pattern = ".tsv")
  3. filePath <- sapply(filenames, function(x){
  4. paste(path,x,x,sep='/')})
  5. filePath <- filePath[c(FALSE, TRUE)]
  6. start <- Sys.time()
  7. survdata <- lapply(filePath, function(x){
  8. phe <- read.table(x,header = T,sep='\t',
  9. fill = T,quote = "")
  10. phe$sample = gsub('-','.',phe$sample)
  11. row.names(phe) <- phe[,1]
  12. return(phe)
  13. # id <- intersect(colnames(expr),rownames(phe))
  14. # expr <- expr[,id]
  15. # phe <- phe[id,]
  16. })
  17. end <- Sys.time()
  18. end - start
  19. # Time difference of 0.5615041 secs
  20. start <- Sys.time()
  21. for(i in 1:length(survdata)){
  22. surv <- survdata[[i]]
  23. save(surv,file = paste0(sub("-","",stringr::str_extract(string = names(filePath)[i],
  24. pattern = '-\\w*')),"_surv.Rdata"))
  25. }
  26. end <- Sys.time()
  27. end - start
  28. # Time difference of 0.2563121 secs