参考
公众号 | 生信技能树 | RNA-seq入门实战(一):上游数据下载、格式转化和质控清洗
1. 参考基因组数据整理
# Create the project layout under $workspace: one directory per analysis step.
# ${workspace:?...} aborts with a message when the variable is unset or empty
# (a bare `cd` would otherwise silently switch to $HOME), and the cd result is
# checked so the directories are never created in an unintended location.
# `mkdir -p` keeps the step re-runnable when some directories already exist.
cd "${workspace:?workspace must be set}" || exit 1
mkdir -p genome      # reference genome: genome.fa genome.gff
mkdir -p 0_raw_data  # download location for the RNA-seq data
mkdir -p 1_qc 2_mapping 3_counts 4_counts2TPM 5_clusterPCA 6_DEG 7_GOenrich
- gff 转 gtf
# Convert the GFF annotation to GTF with gffread: -T selects GTF output and
# -o names the output file.
# NOTE(review): the input is read from the parent directory (../genome.gff),
# while the comment on the `mkdir genome` line places genome.gff inside
# genome/ -- confirm which directory this command is meant to be run from.
gffread ../genome.gff -T -o genome.gtf
2. 转录组数据整理
2.1 文件名和样本处理的对应
- 所有分析步骤的结果文件,要有一个统一的命名标准
- 进行每一步都要有特定的文件夹
- 不同样本(S1)有多个重复(S1_1),每个重复有两个fq文件代表双端测序(S1_1.1和S1_1.2)
raw reads 原文件名(例如) | raw reads 重命名后 | clean reads | 比对结果 |
---|---|---|---|
Unknown_AS611-01T0001_good_1.fq.gz | S1_1.1.fq.gz | S1_1.1.qc.fq.gz | S1_1.sort.bam |
Unknown_AS611-01T0001_good_2.fq.gz | S1_1.2.fq.gz | S1_1.2.qc.fq.gz | |
… | … | … | … |
# Build Sample.ID.txt, a tab-separated table with one row per fastq file:
#   column 1: raw file name as found in 0_raw_data (glob order, i.e. sorted)
#   column 2: standardized name, SY<sample>_<repeat>.<sequence>.fq.gz
#   column 3: replicate ID, SY<sample>_<repeat>
# Rows are paired purely by position, so the sorted raw files must line up
# with the sample/repeat/sequence order generated below.
cd "${workspace:?workspace must be set}/0_raw_data" || exit 1

samples=(0 6 12 24 48 120)
raw_files=(*fq.gz)
# An unmatched glob is left as the literal pattern; treat that as "no files".
[[ -e "${raw_files[0]}" ]] || raw_files=()

# 3 repeats x 2 paired-end files per sample. A different file count means the
# positional pairing below would assign wrong names, so say so loudly.
expected=$(( ${#samples[@]} * 3 * 2 ))
if (( ${#raw_files[@]} != expected )); then
  printf 'warning: expected %d fastq files, found %d; Sample.ID.txt rows will not line up\n' \
    "$expected" "${#raw_files[@]}" >&2
fi

# Process substitution feeds both columns to paste without a scratch file.
paste \
  <(if (( ${#raw_files[@]} > 0 )); then printf '%s\n' "${raw_files[@]}"; fi) \
  <(
    for sample in "${samples[@]}"; do
      for repeat in 1 2 3; do
        for sequence in 1 2; do
          printf 'SY%s_%s.%s.fq.gz\tSY%s_%s\n' \
            "$sample" "$repeat" "$sequence" "$sample" "$repeat"
        done
      done
    done
  ) > Sample.ID.txt
cat Sample.ID.txt
## Unknown_AS611-01T0001_good_1.fq.gz SY0_1.1.fq.gz SY0_1
## Unknown_AS611-01T0001_good_2.fq.gz SY0_1.2.fq.gz SY0_1
## Unknown_AS611-01T0002_good_1.fq.gz SY0_2.1.fq.gz SY0_2
## Unknown_AS611-01T0002_good_2.fq.gz SY0_2.2.fq.gz SY0_2
## Unknown_AS611-01T0003_good_1.fq.gz SY0_3.1.fq.gz SY0_3
## Unknown_AS611-01T0003_good_2.fq.gz SY0_3.2.fq.gz SY0_3
## Unknown_AS611-01T0019_good_1.fq.gz SY6_1.1.fq.gz SY6_1
## Unknown_AS611-01T0019_good_2.fq.gz SY6_1.2.fq.gz SY6_1
## Unknown_AS611-01T0020_good_1.fq.gz SY6_2.1.fq.gz SY6_2
## Unknown_AS611-01T0020_good_2.fq.gz SY6_2.2.fq.gz SY6_2
## Unknown_AS611-01T0021_good_1.fq.gz SY6_3.1.fq.gz SY6_3
## Unknown_AS611-01T0021_good_2.fq.gz SY6_3.2.fq.gz SY6_3
## ......
# Collect the replicate IDs (third column of Sample.ID.txt), collapsing
# adjacent duplicates so each replicate appears once; tee both saves the list
# to Sample.Group.txt and prints it.
awk '{ print $3 }' Sample.ID.txt \
  | uniq \
  | tee Sample.Group.txt
## SY0_1
## SY0_2
## SY0_3
## SY6_1
## SY6_2
## SY6_3
## SY12_1
## SY12_2
## SY12_3
## SY24_1
## SY24_2
## SY24_3
## SY48_1
## SY48_2
## SY48_3
## SY120_1
## SY120_2
## SY120_3
2.2 文件改名
# Rename every raw fastq file (column 1 of Sample.ID.txt) to its standardized
# name (column 2). The table is read field by field and mv is invoked
# directly, instead of generating command text and piping it to bash, so file
# names containing spaces or shell metacharacters cannot be re-interpreted as
# shell syntax. `--` stops a name starting with `-` being parsed as an option.
while IFS=$'\t' read -r raw_name new_name _; do
  # Skip blank or incomplete rows rather than calling mv with a missing operand.
  [[ -n "$raw_name" && -n "$new_name" ]] || continue
  mv -- "$raw_name" "$new_name"
done < Sample.ID.txt