参考

公众号 | 生信技能树 | RNA-seq入门实战(一):上游数据下载、格式转化和质控清洗

1. 参考基因组数据整理

  1. cd $workspace
  2. mkdir genome # 参考基因组:genome.fa genome.gff
  3. mkdir 0_raw_data # RNA测序数据的下载位置
  4. mkdir 1_qc
  5. mkdir 2_mapping
  6. mkdir 3_counts
  7. mkdir 4_counts2TPM
  8. mkdir 5_clusterPCA
  9. mkdir 6_DEG
  10. mkdir 7_GOenrich
  • gff 转 gtf(下面的命令读取上一级目录中的 genome.gff,并在当前目录生成 genome.gtf;运行前请确认所在目录,并按 genome.gff 的实际位置调整路径)
  1. gffread ../genome.gff -T -o genome.gtf

2. 转录组数据整理

2.1 文件名和样本处理的对应

  • 所有分析步骤的结果文件,要有一个统一的命名标准
  • 进行每一步都要有特定的文件夹
  • 不同样本(S1)有多个重复(S1_1),每个重复有两个fq文件代表双端测序(S1_1.1和S1_1.2)
| raw reads 原文件名(例如) | raw reads(改名后) | clean reads | 比对结果 |
| --- | --- | --- | --- |
| Unknown_AS611-01T0001_good_1.fq.gz | S1_1.1.fq.gz | S1_1.1.qc.fq.gz | S1_1.sort.bam |
| Unknown_AS611-01T0001_good_2.fq.gz | S1_1.2.fq.gz | S1_1.2.qc.fq.gz | (同上) |
  1. cd $workspace/0_raw_data
  2. ls *fq.gz > Sample.ID1.txt
  3. for sample in `echo 0 6 12 24 48 120`
  4. do
  5. for repeat in `seq 3`
  6. do
  7. for sequence in `echo 1 2`
  8. do
  9. echo -e "SY${sample}_${repeat}.${sequence}.fq.gz\tSY${sample}_${repeat}"
  10. done
  11. done
  12. done | paste Sample.ID1.txt - > Sample.ID.txt
  1. rm Sample.ID1.txt
  2. cat Sample.ID.txt
  3. ## Unknown_AS611-01T0001_good_1.fq.gz SY0_1.1.fq.gz SY0_1
  4. ## Unknown_AS611-01T0001_good_2.fq.gz SY0_1.2.fq.gz SY0_1
  5. ## Unknown_AS611-01T0002_good_1.fq.gz SY0_2.1.fq.gz SY0_2
  6. ## Unknown_AS611-01T0002_good_2.fq.gz SY0_2.2.fq.gz SY0_2
  7. ## Unknown_AS611-01T0003_good_1.fq.gz SY0_3.1.fq.gz SY0_3
  8. ## Unknown_AS611-01T0003_good_2.fq.gz SY0_3.2.fq.gz SY0_3
  9. ## Unknown_AS611-01T0019_good_1.fq.gz SY6_1.1.fq.gz SY6_1
  10. ## Unknown_AS611-01T0019_good_2.fq.gz SY6_1.2.fq.gz SY6_1
  11. ## Unknown_AS611-01T0020_good_1.fq.gz SY6_2.1.fq.gz SY6_2
  12. ## Unknown_AS611-01T0020_good_2.fq.gz SY6_2.2.fq.gz SY6_2
  13. ## Unknown_AS611-01T0021_good_1.fq.gz SY6_3.1.fq.gz SY6_3
  14. ## Unknown_AS611-01T0021_good_2.fq.gz SY6_3.2.fq.gz SY6_3
  15. ## ......
  16. awk '{print $3}' Sample.ID.txt | uniq > Sample.Group.txt
  17. cat Sample.Group.txt
  18. ## SY0_1
  19. ## SY0_2
  20. ## SY0_3
  21. ## SY6_1
  22. ## SY6_2
  23. ## SY6_3
  24. ## SY12_1
  25. ## SY12_2
  26. ## SY12_3
  27. ## SY24_1
  28. ## SY24_2
  29. ## SY24_3
  30. ## SY48_1
  31. ## SY48_2
  32. ## SY48_3
  33. ## SY120_1
  34. ## SY120_2
  35. ## SY120_3

2.2 文件改名

  1. awk '{print "mv " $1 " " $2}' Sample.ID.txt | bash