grep
grep:一种强大的文本搜索工具,它能使用正则表达式匹配模式搜索文本,并把匹配的行打印出来
格式: grep [options] pattern file
常见参数:[options]
-w:word 全词匹配,只匹配作为完整单词出现的pattern(例如'gene'不会匹配'gene_id')
-c:count 统计匹配成功的行的数量
-v:invert 反向选择,即输出没有匹配的行
-n:number 显示匹配成功的行所在的行号
-r:recursive 从目录中递归查找pattern
-e:regexp 指定匹配模式pattern,可重复使用多个-e以指定多个匹配模式(匹配任意一个即输出)
-f:file 从指定文件中读取要匹配的pattern
-i:ignore 忽略大小写
$ less Data/example.gtf |column -t| grep 'gene' -w| head -n 5
chr1 HAVANA gene 1737 4275 . + . gene_id "ENSG00000223972"; transcript_id "ENSG00000223972"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "RP11-34P13.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "RP11-34P13.1"; level 2; havana_gene "OTTHUMG00000000961";
chr1 HAVANA gene 4226 19433 . - . gene_id "ENSG00000227232"; transcript_id "ENSG00000227232"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "WASH5P"; level 2; havana_gene "OTTHUMG00000000958";
chr1 HAVANA gene 19417 20972 . + . gene_id "ENSG00000243485"; transcript_id "ENSG00000243485"; gene_type "processed_transcript"; gene_status "KNOWN"; gene_name "AL627309.6"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "AL627309.6"; level 2; havana_gene "OTTHUMG00000000959";
chr1 ENSEMBL gene 20229 20366 . + . gene_id "ENSG00000221311"; transcript_id "ENSG00000221311"; gene_type "miRNA"; gene_status "KNOWN"; gene_name "hsa-mir-1302-2"; transcript_type "miRNA"; transcript_status "KNOWN"; transcript_name "hsa-mir-1302-2"; level 3;
chr1 HAVANA gene 24417 25944 . - . gene_id "ENSG00000237613"; transcript_id "ENSG00000237613"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "FAM138A"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "FAM138A"; level 2; havana_gene "OTTHUMG00000000960";
$ less Data/example.gtf |column -t| grep 'gene' -wv | head -n 5
chr1 ENSEMBL UTR 1737 2090 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "RP11-34P13.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "RP11-34P13.1-201"; level 3; havana_gene "OTTHUMG00000000961";
chr1 ENSEMBL exon 1737 2090 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "RP11-34P13.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "RP11-34P13.1-201"; level 3; havana_gene "OTTHUMG00000000961";
chr1 ENSEMBL transcript 1737 4275 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "RP11-34P13.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "RP11-34P13.1-201"; level 3; havana_gene "OTTHUMG00000000961";
chr1 HAVANA exon 1873 1920 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000450305"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "RP11-34P13.1"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "RP11-34P13-001"; level 2; havana_gene "OTTHUMG00000000961"; havana_transcript "OTTHUMT00000002844"; ont "PGO:0000005";
chr1 HAVANA transcript 1873 3533 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000450305"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "RP11-34P13.1"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "RP11-34P13-001"; level 2; havana_gene "OTTHUMG00000000961"; havana_transcript "OTTHUMT00000002844"; ont "PGO:0000005";
$ less Data/example.gtf |column -t| grep 'gene' -wc
20
$ less Data/example.gtf |column -t| grep -w -e 'gene' -e 'UTR'| head -n 5
chr1 ENSEMBL UTR 1737 2090 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "RP11-34P13.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "RP11-34P13.1-201"; level 3; havana_gene "OTTHUMG00000000961";
chr1 HAVANA gene 1737 4275 . + . gene_id "ENSG00000223972"; transcript_id "ENSG00000223972"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "RP11-34P13.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "RP11-34P13.1"; level 2; havana_gene "OTTHUMG00000000961";
chr1 ENSEMBL UTR 2476 2584 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "RP11-34P13.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "RP11-34P13.1-201"; level 3; havana_gene "OTTHUMG00000000961";
chr1 ENSEMBL UTR 3084 4021 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "RP11-34P13.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "RP11-34P13.1-201"; level 3; havana_gene "OTTHUMG00000000961";
chr1 ENSEMBL UTR 4226 4561 . - . gene_id "ENSG00000227232"; transcript_id "ENST00000438504"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "WASH5P-203"; level 3; havana_gene "OTTHUMG00000000958";
$ cat >f.txt
gene
UTR
start_codon
stop_codon
^C
$ less Data/example.gtf |column -t| grep -w -f f.txt |head -n 5
chr1 ENSEMBL UTR 1737 2090 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "RP11-34P13.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "RP11-34P13.1-201"; level 3; havana_gene "OTTHUMG00000000961";
chr1 HAVANA gene 1737 4275 . + . gene_id "ENSG00000223972"; transcript_id "ENSG00000223972"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "RP11-34P13.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "RP11-34P13.1"; level 2; havana_gene "OTTHUMG00000000961";
chr1 ENSEMBL UTR 2476 2584 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "RP11-34P13.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "RP11-34P13.1-201"; level 3; havana_gene "OTTHUMG00000000961";
chr1 ENSEMBL UTR 3084 4021 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "RP11-34P13.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "RP11-34P13.1-201"; level 3; havana_gene "OTTHUMG00000000961";
chr1 ENSEMBL start_codon 4022 4024 . + 0 gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "RP11-34P13.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "RP11-34P13.1-201"; level 3; havana_gene "OTTHUMG00000000961";