1.参考基因组的准备
2.Hisat2、 salmon

一、参考基因组

  1. ## 参考基因组准备:注意参考基因组版本信息
  2. # 下载,Ensembl:http://asia.ensembl.org/index.html
  3. # http://ftp.ensembl.org/pub/release-104/fasta/homo_sapiens/dna/
  4. # 进入到参考基因组目录
  5. mkdir -p $HOME/database/GRCh38.105
  6. cd $HOME/database/GRCh38.105
  7. # 下载基因组序列axel curl
  8. nohup wget -c http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz >dna.log &
  9. # 下载转录组序列
  10. nohup wget -c http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz >rna.log &
  11. # 下载基因组注释文件
  12. nohup wget -c http://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/Homo_sapiens.GRCh38.105.chr.gtf.gz >gtf.log &
  13. nohup wget -c http://ftp.ensembl.org/pub/release-105/gff3/homo_sapiens/Homo_sapiens.GRCh38.105.chr.gff3.gz >gff.log&
  14. # 上述文件下载完整后,再解压;否则文件不完整就解压会报错
  15. # 再次强调,一定要在文件下载完后再进行解压!!!
  16. nohup gunzip Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz Homo_sapiens.GRCh38.cdna.all.fa.gz >unzip.log &

二、比对

建索引—比对参考基因组—sam转bam—bam建索引

1、Hisat2

  1. ## ----构建索引
  2. # 进入参考基因组目录
  3. cd $HOME/database/GRCh38.105
  4. # Hisat2构建索引,构建索引时间比较长,建议提交后台运行,一般会运行20多分钟左右
  5. ## 后续索引可直接使用服务器上已经构建好的进行练习
  6. # vim Hisat2Index.sh
  7. mkdir Hisat2Index
  8. fa=Homo_sapiens.GRCh38.dna.primary_assembly.fa
  9. fa_baseName=GRCh38.dna
  10. hisat2-build -p 12 -f ${fa} Hisat2Index/${fa_baseName}
  11. # 运行
  12. nohup sh Hisat2Index.sh >Hisat2Index.sh.log &
  13. ## ----比对
  14. # 进入比对文件夹
  15. cd $HOME/project/Human-16-Asthma-Trans/Mapping/Hisat2
  16. ## 单个样本比对,步骤分解
  17. index=/home/t_rna/database/GRCh38.104/Hisat2Index/GRCh38.dna
  18. inputdir=$HOME/project/Human-16-Asthma-Trans/data/cleandata/trim_galore/
  19. outdir=$HOME/project/Human-16-Asthma-Trans/Mapping/Hisat2
  20. hisat2 -p 10 -x ${index} \
  21. -1 ${inputdir}/SRR1039510_1_val_1.fq.gz \
  22. -2 ${inputdir}/SRR1039510_2_val_2.fq.gz \
  23. -S ${outdir}/SRR1039510.Hisat_aln.sam
  24. # sam转bam
  25. samtools sort -@ 15 -o SRR1039510.Hisat_aln.sorted.bam SRR1039510.Hisat_aln.sam
  26. # 对bam建索引
  27. samtools index SRR1039510.Hisat_aln.sorted.bam SRR1039510.Hisat_aln.sorted.bam.bai
  28. # 多个样本批量进行比对,排序,建索引
  29. # Hisat.sh内容: 注意命令中的-,表示占位符,表示|管道符前面的输出。
  30. ## 此处索引直接使用服务器上已经构建好的进行练习
  31. # vim Hisat.sh
  32. index=/home/t_rna/database/GRCh38.104/Hisat2Index/GRCh38.dna
  33. inputdir=$HOME/project/Human-16-Asthma-Trans/data/cleandata/trim_galore/
  34. outdir=$HOME/project/Human-16-Asthma-Trans/Mapping/Hisat2
  35. cat ../../data/cleandata/trim_galore/ID | while read id
  36. do
  37. hisat2 -p 5 -x ${index} -1 ${inputdir}/${id}_1_val_1.fq.gz -2 ${inputdir}/${id}_2_val_2.fq.gz 2>${id}.log | samtools sort -@ 3 -o ${outdir}/${id}.Hisat_aln.sorted.bam - && samtools index ${outdir}/${id}.Hisat_aln.sorted.bam ${outdir}/${id}.Hisat_aln.sorted.bam.bai
  38. done
  39. # 统计比对情况
  40. multiqc -o ./ SRR*log
  41. # 提交后台运行
  42. nohup sh Hisat.sh >Hisat.log &

2、subjunc

  1. ## ----构建索引
  2. # 进入参考基因组目录
  3. cd $HOME/database/GRCh38.105
  4. # subjunc构建索引,构建索引时间比较长大约40分钟左右,建议携程sh脚本提交后台运行
  5. ## 后续索引可直接使用服务器上已经构建好的进行练习
  6. # vim SubjuncIndex.sh
  7. mkdir SubjuncIndex
  8. fa=Homo_sapiens.GRCh38.dna.primary_assembly.fa
  9. fa_baseName=GRCh38.dna
  10. subread-buildindex -o SubjuncIndex/${fa_baseName} ${fa}
  11. # 运行
  12. nohup sh SubjuncIndex.sh >SubjuncIndex.sh.log &
  13. ## ----比对
  14. # 进入文件夹
  15. cd $HOME/project/Human-16-Asthma-Trans/Mapping/Subjunc
  16. # vim subjunc.sh
  17. index=/home/t_rna/database/GRCh38.104/SubjuncIndex/GRCh38.dna
  18. inputdir=$HOME/project/Human-16-Asthma-Trans/data/cleandata/trim_galore
  19. outdir=$HOME/project/Human-16-Asthma-Trans/Mapping/Subjunc
  20. cat ../../data/cleandata/trim_galore/ID | while read id
  21. do
  22. subjunc -T 10 -i ${index} -r ${inputdir}/${id}_1_val_1.fq.gz -R ${inputdir}/${id}_2_val_2.fq.gz -o ${outdir}/${id}.Subjunc.bam 1>${outdir}/${id}.Subjunc.log 2>&1 && samtools sort -@ 6 -o ${outdir}/${id}.Subjunc.sorted.bam ${outdir}/${id}.Subjunc.bam && samtools index ${outdir}/${id}.Subjunc.sorted.bam ${outdir}/${id}.Subjunc.sorted.bam.bai
  23. done
  24. # 运行
  25. nohup sh subjunc.sh >subjunc.log &

3、统计比对结果

# 单个样本
samtools flagstat -@ 3 SRR1039510.Hisat_aln.sorted.bam

# 多个样本,vim flagstat.sh
ls *.sorted.bam | while read id
do
  samtools flagstat -@ 10 ${id} > ${id/bam/flagstat}
done
# 质控
multiqc -o ./  *.flagstat

# 运行
nohup sh flagstat.sh >flagstat.log &

三、定量

1、featureCounts

cd $HOME/project/Human-16-Asthma-Trans/Expression/featureCounts

## 定义输入输出文件夹
gtf=/home/t_rna/database/GRCh38.104/Homo_sapiens.GRCh38.104.chr.gtf.gz
inputdir=$HOME/project/Human-16-Asthma-Trans/Mapping/Hisat2/

# featureCounts对bam文件进行计数
featureCounts -T 6 -p -t exon -g gene_id -a $gtf -o all.id.txt $inputdir/*.sorted.bam

# 对定量结果质控
multiqc all.id.txt.summary

# 得到表达矩阵
# 处理表头,/home/t_rna/要换成自己的路径
less -S all.id.txt |grep -v '#' |cut -f 1,7- |sed 's#/home/t_rna/project/Human-16-Asthma-Trans/Mapping/Hisat2//##g' |sed 's#.Hisat_aln.sorted.bam##g' >raw_counts.txt

sed ///
sed ###


# 列对齐显示
head raw_counts.txt  |column -t

2、salmon

##----构建索引
## 后续索引可直接使用服务器上已经构建好的进行练习
cd $HOME/database/GRCh38.105
nohup salmon index -t Homo_sapiens.GRCh38.cdna.all.fa -i salmon_index >salmon-index.log &
cd $HOME/project/Human-16-Asthma-Trans/Expression/Salmon

##----运行
# 编写脚本,使用salmon批量对目录下所有fastq文件进行定量
# vim salmon.sh
index=/home/t_rna/database/GRCh38.104/salmon_index
input=$HOME/project/Human-16-Asthma-Trans/data/cleandata/trim_galore
outdir=$HOME/project/Human-16-Asthma-Trans/Expression/Salmon
cat ../../data/cleandata/trim_galore/ID |while read id 
do
  salmon quant -i ${index} -l A -1 ${input}/${id}_1_val_1.fq.gz -2 ${input}/${id}_2_val_2.fq.gz -p 5 -o ${outdir}/${id}.quant
done

##----合并表达矩阵
# 原始count值矩阵
# --quants:ls -d *quant |tr '\n' ',' |sed 's/,$//' |awk '{print "{"$0"}"}'
# --quants:ls -d *quant |sed ':a;N;s/\n/,/;t a;'|awk '{print "{"$0"}"}'
# --quants:ls -d *quant |xargs  |tr ' ' ',' |awk '{print "{"$0"}"}'
# --names:ls -d *quant |tr '\n' ',' |sed 's/,$//' |awk '{print "{"$0"}"}' |sed 's/.quant//g'

## 完整版:ls -d *quant |tr '\n' ',' |sed 's/,$//' |awk '{print "{"$0"}"}'  |perl -ne 'chomp;$a=$_;$a=~s/\.quant//g;print"salmon quantmerge --quants $_ --names $a --column numreads  --output raw_count.txt \n";'

salmon quantmerge --quants {SRR1039510.quant,SRR1039511.quant,SRR1039512.quant} --names {SRR1039510,SRR1039511,SRR1039512} --column numreads  --output raw_count.txt

# tpm矩阵
salmon quantmerge --quants {SRR1039510.quant,SRR1039511.quant,SRR1039512.quant} --names {SRR1039510,SRR1039511,SRR1039512} --column tpm  --output tpm.txt

# 后台运行脚本
nohup sh salmon.sh >salmon.log &

代码及图片均来自于生信技能树张娟老师