以下的突变检测都是基于NGS测序的结果,但是需要预先使用bwa
等工具预先比对到参考序列或者基因组上,
然后根据要研究的目标区域提取比对上的序列作进一步的分析,一个有用的方式就是使用samtools
.
samtools view -b -F chrN:NNNNN-NNNNN > contig_1.bam
因为提取出来的都是短序列,因此,我们可以在下游使用全局比对(Needleman-Wunsch)来获取比对详情。
经过简单的处理,我将在bam文件中提取比对到参考序列上的短序列并存入Fasta格式
query:
>10613
CATTGAGATGGTGTGGGAAGGGGCCC
CATTGAGATAGTGTGGCCCGTTTGAAGGGGCCC
CATTGAGATAGAAGGGGCCC
CATTGAGATAGAAGGGGCCC
CATTGAGATAGTGTGG
CATTGAGATAGGCCC
CATTGAGATAGTGTGGGCCC
CATTGAGATAGTGTGGGAAGGGGCCC
CATTGAGATAGTGTGGGAAGGGGCCC
CATTGAGATAGTGTGGGCCC
CATTGAGATAGTGTGGGAAGGGGCCC
CATTGAGATAGTGTGGGAAGGGGCCC
CATTGAGATAGTGTGGGCCC
CATTGAGATAGTGCCC
CATTGGGCCC
CATTGAGATAGGGCCC
CATTGAGATATGTGGGGAAGGGGCCC
'''''
ref:
CATTGAGATAGTGTGGGGAAGGGGCCC
突变检测
目标:统计出indel,snp等常见突变的位置,突变量(nbp突变)以及突变频率
需要借助python多序列比对模块,这里我们采用github上的一个
##导入所有需要的模块
from align.calign import aligner as caligner
from align.matrix import DNAFULL, BLOSUM62
from collections import Counter
import re
from pyfaidx import Fasta
import pandas as pd
import numpy as np
##定义读入fasta序列的函数
def read_res(fs):
fasta={}
with open(fs,'r') as f:
for line in f:
if line.startswith(">"):
seq_lst=[]
name=line.strip("\n").replace(">","")
else:
seq_lst.append(line.strip("\n"))
fasta[name]=seq_lst
return fasta
###采用正则表达式来提取突变
def clean_insertion(ref,query):
ins_lst=[]
delet=[]
for site in re.finditer(r"-+",ref):
(start,end)=site.span()
length=end-start
ins_lst.append("{}I{}".format(start+1,length))
delet.append(list(range(start,end)))
delet_2=[y for x in delet for y in x]
query="".join([query[i] for i in range(len(query)) if i not in delet_2])
ref=ref.replace("-","")
ins_lst2=";".join(ins_lst)
return (ref,query,ins_lst2)
def detective_del(ref,query):
deletion=[]
for st in re.finditer(r"-+",query):
(start,end)=st.span()
length=end-start
deletion.append("{}D{}".format(start+1,length))
deletion2=";".join(deletion)
return (ref,query,deletion2)
def detective_snp(ref,query):
snp_lst=["{}M1".format(st+1) for st in range(len(ref)) if (query[st]!=ref[st])&(query[st]!="-")&(ref[st]!="-")]
snp=";".join(snp_lst)
return (ref,query,snp)
def crispr_det(querys,ref):
query_lst=Counter(querys)
alreads=sum(list(query_lst.values()))
fin=[]
for key,val in query_lst.items():
bijiao=caligner(ref,key,method="global",matrix=DNAFULL)
(ref,query,insertions)=clean_insertion(bijiao[0].seq1.decode(),bijiao[0].seq2.decode())
(ref,query,deletions)=detective_del(ref,query)
(ref,query,mutations)=detective_snp(ref,query)
tmp={"reference":ref,"query":query,"ins":insertions,"del":deletions,"mut":mutations,"percentage":"{:.2f}".format(val/alreads*100)}
fin.append(tmp)
return fin
##读入query&ref
test=read_res("query.fa")
reference = Fasta("reference.fasta")
querys=test['10613']
ref=reference["10613"][113:140].seq ## 请查阅 pyfaidx 模块介绍
df=pd.DataFrame(crispr_det(querys,ref))
df['percentage']=df['percentage'].astype(np.float64)
df=df.sort_values('percentage',axis=0,ascending=False)
df.head(10)
运行完后可以查看最终获得的结果:
reference | query | ins | del | mut | percentage | |
---|---|---|---|---|---|---|
6 | CATTGAGATAGTGTGGGGAAGGGGCCC | CATTGAGATAGTGTGGG-AAGGGGCCC | 18D1 | 16.53 | ||
5 | CATTGAGATAGTGTGGGGAAGGGGCCC | CATTGAGATAGTGTGGG———-CCC | 18D7 | 10.93 | ||
10 | CATTGAGATAGTGTGGGGAAGGGGCCC | CATTGAGATA-TGTGGGGAAGGGGCCC | 11D1 | 10.10 | ||
30 | CATTGAGATAGTGTGGGGAAGGGGCCC | CATTGAGATAGTGTGGGG———CCC | 19D6 | 7.60 | ||
2 | CATTGAGATAGTGTGGGGAAGGGGCCC | CATTGAGATAG———-AAGGGGCCC | 12D7 | 5.34 | ||
14 | CATTGAGATAGTGTGGGGAAGGGGCCC | CATTGAGATAGTGTGG————CCC | 17D8 | 4.42 | ||
3 | CATTGAGATAGTGTGGGGAAGGGGCCC | CATTGAGATAGTGTGG—————- | 17D11 | 3.67 | ||
9 | CATTGAGATAGTGTGGGGAAGGGGCCC | CATTGAGATAG-G-G————-CCC | 12D1;14D1;16D9 | 3.09 | ||
13 | CATTGAGATAGTGTGGGGAAGGGGCCC | CATTGAGA-AG-G-GG————CCC | 9D1;12D1;14D1;17D8 | 2.92 | ||
18 | CATTGAGATAGTGTGGGGAAGGGGCCC | C—-GA-A—G-G-GG————CCC | 2D3;7D1;9D2;12D1;14D1;17D8 | 2.67 |
对以上结果的可视化展示
因为R语言画图是很方便的,所以这里采用R语言热图展示突变情况
##数据处理
data<-read_delim("ontarget6.mutation.tsv",delim = "\t")
ref<-stringr::str_split_fixed(data$reference[1],"",n=27)
ref<-as.character(ref)
need<-data%>%filter(percentage>=1)%>%tidyr::replace_na(replace = list(ins="-",del="-",mut="-"))
plotData<-stringr::str_split_fixed(need$query,"",n=27)
gRNA<-c(rep("1",23),rep("2",4))
##画
Heatmap(plotData,
cluster_rows = FALSE,
cluster_columns = FALSE,
column_split = c(rep("1",17),rep("2",10)),
column_title = NULL,
col = c("A"="#d1e8cd","G"="#ffffda","C"="#fae6d1","T"="#e4daec","-"="#e8e9ea"),
cell_fun = function(j,i,x,y,wt,ht,fill){
if(plotData[i,j]!=ref[j]){
grid.text(sprintf("%s", plotData[i, j]), x, y, gp = gpar(fontsize = 10,fontface="bold"))
}else{
grid.text(sprintf("%s", plotData[i, j]), x, y, gp = gpar(fontsize = 10))
}
},
top_annotation = HeatmapAnnotation(ref=anno_simple(ref,pch=ref,col = c("A"="#d1e8cd","G"="#ffffda","C"="#fae6d1","T"="#e4daec")),
sgRNA=anno_simple(gRNA,col = c("1"="grey","2"="white")),
annotation_name_gp = gpar(fontsize=10,fontface="bold")),
right_annotation = rowAnnotation(percentage=anno_text(sprintf("%s%%",need$percentage),gp=gpar(fontsize=10,fontface="bold")),
insertion=anno_text(need$ins,gp=gpar(fontsize=10,fontface="bold")),
deletion=anno_text(need$del,gp=gpar(fontsize=10,fontface="bold")),
substitutions=anno_text(need$mut,gp=gpar(fontsize=10,fontface="bold")),
gap=unit(2,'char')),
show_heatmap_legend = FALSE
)