Homo_sapiens.GRCh38.99.gtf 是人类参考基因组(GRCh38)的注释文件,可从 Ensembl 下载。

    # Tabulate the transcript_biotype attribute found in the 9th column of the
    # GTF file.
    # Note: one count is produced per row of the file whose 9th column carries
    # a transcript_biotype attribute, so the totals are row counts of the file.
    options(stringsAsFactors = FALSE)
    library(tidyr)
    library(dplyr)
    library(stringr)

    # skip = 5 drops the first five lines of the file before parsing
    # (presumably the "#!" header lines of this release -- verify if the
    # input file changes).
    qe <- data.table::fread(
      "Homo_sapiens.GRCh38.99.gtf",
      sep = "\t", header = FALSE, fill = TRUE, skip = 5
    )
    x <- qe$V9

    # Quick look at how many ";"-terminated attributes the first rows carry.
    table(str_count(head(x, 100), ";"))

    # Capture the quoted value that follows `transcript_biotype` directly,
    # instead of splitting every row into a full attribute matrix first.
    # Rows without the attribute yield NA and are dropped.
    x3 <- str_match(x, "transcript_biotype \"([^\"]*)\"")[, 2]
    x3 <- x3[!is.na(x3)]
    head(x3)
    table(x3)
     IG_C_gene 
                                   266 
                       IG_C_pseudogene 
                                    24 
                             IG_D_gene 
                                   115 
                             IG_J_gene 
                                    58 
                       IG_J_pseudogene 
                                     6 
                         IG_pseudogene 
                                     2 
                             IG_V_gene 
                                   985 
                       IG_V_pseudogene 
                                   476 
                                lncRNA 
                                220139 
                                 miRNA 
                                  3758 
                              misc_RNA 
                                  4470 
                               Mt_rRNA 
                                     4 
                               Mt_tRNA 
                                    44 
               nonsense_mediated_decay 
                                366812 
                        non_stop_decay 
                                  1464 
                polymorphic_pseudogene 
                                  1207 
                  processed_pseudogene 
                                 21963 
                  processed_transcript 
                                156564 
                        protein_coding 
                               1881424 
                            pseudogene 
                                   144 
                       retained_intron 
                                153856 
                              ribozyme 
                                    16 
                                  rRNA 
                                   116 
                       rRNA_pseudogene 
                                   998 
                                scaRNA 
                                    98 
                                 scRNA 
                                     2 
                                snoRNA 
                                  1908 
                                 snRNA 
                                  3820 
                                  sRNA 
                                    10 
                                   TEC 
                                  2317 
      transcribed_processed_pseudogene 
                                  1162 
        transcribed_unitary_pseudogene 
                                  1406 
    transcribed_unprocessed_pseudogene 
                                  6659 
       translated_processed_pseudogene 
                                     4 
     translated_unprocessed_pseudogene 
                                     8 
                             TR_C_gene 
                                    64 
                             TR_D_gene 
                                    12 
                             TR_J_gene 
                                   237 
                       TR_J_pseudogene 
                                     8 
                             TR_V_gene 
                                   717 
                       TR_V_pseudogene 
                                    90 
                    unitary_pseudogene 
                                   376 
                unprocessed_pseudogene 
                                 10567 
                              vaultRNA 
                                     2