前言
Crispr基因编辑正越来越广泛的应用在各个方面,包括科研,医疗等等,针对经过筛选的药物靶向基因设计gRNA,使其由原始的基因序列突变为终止密码子,从而无法表达蛋白,进而治疗疾病或者抵抗药物
算法
初步过滤:
- 将12ktrap中,N1-N20内C>T成功的trap提取出来(所有编辑后的reads,不做trap的去重),定义为dataset#1
- 将dataset#1分成2部分,一部分是gRNA在基因的正义链(+)上,定义为dataset#1(+),另一部分在反义链(-)上,定义为dataset#1(-);
正义链的情况(CAG/CGA/CAA > TAG/TGA/TAA)
:
- 找出dataset#1(+)N1-N20中 C>T成功的位置,并往后推2个碱基,判断其是否为CAG或CGA或CAA(编辑前) ,满足其一即抽提出来,完成后定义为dataset#2(+),同时标记出突变位点在gRNA中的位置,譬如N=5, N=6等;
- 将dataset#2(+)中的gRNA(不包含我们人工添加的第一个碱基G)mapping回对应基因的CDS中,数出从ATG(翻译起始位置)的A到gRNA第一个碱基的碱基数M,随后算出 (M-1+N-1)/3, 判断结果是否为整数,如果是整数,判断该基因被stop成功,输出
”TRAP ID + gRNA sequence + strand (+/-) + C>T position + C>T efficiency”
; - 举例说明4)中算法:如N=6, M=11, 则说明突变位点的C距离ATG位置是10+5=15个碱基,即15/3=5个氨基酸,到突变位点(如CAG)恰好是3的整数倍,该基因被stop成功;
反义链的情况(CCA > TCA (TGA) / CTA(TAG) / TTA (TAA))
:
- 找出dataset#1(-)N1-N20中 C>T成功的位置,并判断其是否在CCA这样的motif中(编辑前),如果在,输出trap放到dataset#2(-)中,并标记出CCA motif中A所在gRNA中的位置N,譬如N=5, N=6等;
- 将dataset#2(-)中的gRNA提取出来(此时gRNA的序列应该是和CDS正义链互补的),数出从ATG(翻译起始位置)的A到gRNA第一个碱基的碱基数M (注意此时gRNA为CCNN20,gRNA第一个碱基在最右端),随后算出(M-N)/3是否为整数,如果是整数,判断该基因被stop成功,输出
”TRAP ID + gRNA sequence + strand (+/-) + C>T position + C>T efficiency”
;
需求是:
- 获取每条TRAP序列所对应的stop效率,效率的计算方法是针对一条TRAP来说,至少有一个位点能够stop成功,那么这条对应的read为成功一次,一次类推,既不能重复计数,也不能少计数
- 获取每个位点对应的stop效率,如上原理所述
- 获取基因的stop率:
stop-gene/all-gene
我使用的代码
代码一:此代码用于输出所有通过的stop reads并计算每条TRAP的stop效率
# -*- coding=utf-8 -*-
import re
def complement(seq):
return seq.translate(str.maketrans('ACGT', 'TGCA'))
# Obtain reverse complementary sequence
def revcomp(seq):
return complement(seq)[::-1]
# Input n site, all CDs sequences and trap37bp sequences, and check whether the site can stop
# 1 for success, 0 for failure
def stop_test_for_Z(n, cds_lst, trap):
new_cds_lst = [cds for cds in cds_lst if trap in cds]
score_lst = []
if len(new_cds_lst) > 0:
for haha in new_cds_lst:
m = haha.find(trap) + 1 + 10
score = (m - 1 + n - 1)
score_lst.append(score)
if len([new_score for new_score in score_lst if new_score % 3 == 0]) > 0:
return 1
else:
return 0
else:
return 0
def stop_test_for_F(n, cds_lst, trap):
trap = revcomp(trap)
new_cds_lst = [cds for cds in cds_lst if trap in cds]
score_lst = []
if len(new_cds_lst) > 0:
for haha in new_cds_lst:
m = haha.find(trap) + 1 + 10 + 23
score = (m - n)
score_lst.append(score)
if len([new_score for new_score in score_lst if new_score % 3 == 0]) > 0:
return 1
else:
return 0
else:
return 0
# Read all CDS files and return a dictionary. The key is gene and the value is CDS sequence
def read_cds(fs):
fasta = {}
with open(fs, 'r') as f:
for line in f:
if line.startswith(">"):
seq_lst = []
name = line.strip("\n").replace(">", "")
else:
seq_lst.append(line.strip("\n"))
fasta[name] = "".join(seq_lst)
new_fasta = {key: value for key,
value in fasta.items() if "unavailable" not in value}
return new_fasta
# Read all the trap sequences and return a dictionary. The key is trap label and the value is 37bp
def read_seq(fs):
fasta = {}
with open(fs, 'r') as f:
for line in f:
if line.startswith(">"):
seq_lst = []
name = line.strip("\n").replace(">", "")
else:
seq_lst.append(line.strip("\n"))
fasta[name] = seq_lst
return fasta
# Read all reference sequences. The key is label and the value is 37bptrap sequence
def read_ref(fs):
ref = {}
with open(fs, 'r') as f:
for line in f:
if line.startswith(">"):
name = line.strip("\n").replace(">", "")
else:
ref[name] = line.strip("\n")[118:155]
return ref
# Read the corresponding relationship between gene and label and return to the dictionary
def read_duiying(fs):
duiying = {}
with open(fs, 'r') as f:
for line in f:
lable, gene = line.split("\t")
duiying[lable.strip("\n")] = gene.strip("\n")
return duiying
def motif_Z(trap, site):
if trap[10:30][site: site + 3] in ["CAG", "CGA", "CAA"]:
return 1
else:
return 0
def motif_F(trap, site):
if "CCA" in trap[10:30][site-2: site + 3]:
return 1
else:
return 0
lable_map_gene = read_duiying("../duiying.txt")
all_cds = read_cds("../mart_export.txt")
all_ref = read_ref("ref.fa")
all_trapseq = read_seq("new508.filter.fa")
eff_lst = []
for name, traps in all_ref.items():
gene = lable_map_gene[name]
cds_lst = [vals for keys, vals in all_cds.items() if gene in keys]
pass_lst = []
if name in list(all_trapseq.keys()):
alltrap = len(all_trapseq[name])
seq_lst = all_trapseq[name]
for m in re.finditer("C", traps[10:30]):
if motif_Z(traps, m.start()):
if stop_test_for_Z((m.start() + 1), cds_lst, traps):
while seq_lst:
gg=seq_lst.pop(0)
if gg[10:30][m.start()] == "T":
pass_lst.append(gg)
if len(pass_lst) > 0:
with open("pass_lst_z.fa", 'a') as wocao:
wocao.write(">{}\n".format(name))
for nima in pass_lst:
wocao.write("{}\n".format(nima))
if alltrap > 0:
eff_lst.append([name, len(pass_lst) / alltrap])
else:
print(name)
for name, traps in all_ref.items():
gene = lable_map_gene[name]
cds_lst = [vals for keys, vals in all_cds.items() if gene in keys]
pass_lst = []
if name in list(all_trapseq.keys()):
alltrap = len(all_trapseq[name])
seq_lst2 = all_trapseq[name]
for m in re.finditer("C", traps[10:30]):
if motif_F(traps, m.start()):
if stop_test_for_F(((traps[10:30][m.start()-2: m.start() + 3]).find("CCA")+1+2), cds_lst, traps):
while seq_lst2:
gg=seq_lst2.pop(0)
if gg[10:30][m.start()] == "T":
pass_lst.append(gg)
if len(pass_lst) > 0:
with open("pass_lst_f.fa", 'a') as wocao:
wocao.write(">{}\n".format(name))
for nima in pass_lst:
wocao.write("{}\n".format(nima))
if alltrap > 0:
eff_lst.append([name, len(pass_lst) / alltrap])
else:
print(name)
with open("trap_stop_eff.txt", 'a') as l:
l.write("trap-lable\ttrap-effective\n")
for lable, effctive in eff_lst:
l.write("{}\t{}\n".format(lable, effctive))
代码二:该代码用于计算需求2
# -*- coding=utf-8 -*-
import re
def complement(seq):
return seq.translate(str.maketrans('ACGT', 'TGCA'))
# 获取反向互补序列
def revcomp(seq):
return complement(seq)[::-1]
# 输入n位点,对应的所有CDS序列以及trap37bp序列,检查该位点能否stop
# 成功返回1,失败返回0
def stop_test_for_Z(n, cds_lst, trap):
new_cds_lst = [cds for cds in cds_lst if trap in cds]
score_lst = []
if len(new_cds_lst) > 0:
for haha in new_cds_lst:
m = haha.find(trap) + 1 + 10
score = (m - 1 + n - 1)
score_lst.append(score)
if len([new_score for new_score in score_lst if score % 3 == 0]) > 0:
return 1
else:
return 0
else:
return 0
def stop_test_for_F(n, cds_lst, trap):
trap = revcomp(trap)
new_cds_lst = [cds for cds in cds_lst if trap in cds]
score_lst = []
if len(new_cds_lst) > 0:
for haha in new_cds_lst:
m = haha.find(trap) + 1 + 10 + 23
score = (m - n)
score_lst.append(score)
if len([new_score for new_score in score_lst if score % 3 == 0]) > 0:
return 1
else:
return 0
else:
return 0
# 读取所有的cds文件返回一个字典,键为gene,值为cds序列
def read_cds(fs):
fasta = {}
with open(fs, 'r') as f:
for line in f:
if line.startswith(">"):
seq_lst = []
name = line.strip("\n").replace(">", "")
else:
seq_lst.append(line.strip("\n"))
fasta[name] = "".join(seq_lst)
new_fasta = {key: value for key,
value in fasta.items() if "unavailable" not in value}
return new_fasta
# 读取所有的trap序列返回一个字典,键为trap-lable,值为37bp列表
def read_seq(fs):
fasta = {}
with open(fs, 'r') as f:
for line in f:
if line.startswith(">"):
seq_lst = []
name = line.strip("\n").replace(">", "")
else:
seq_lst.append(line.strip("\n"))
fasta[name] = seq_lst
return fasta
# 读取所有的参考序列,键为lable,值为37bptrap序列
def read_ref(fs):
ref = {}
with open(fs, 'r') as f:
for line in f:
if line.startswith(">"):
name = line.strip("\n").replace(">", "")
else:
ref[name] = line.strip("\n")[118:155]
return ref
# 读取基因和lable的对应关系,返回字典
def read_duiying(fs):
duiying = {}
with open(fs, 'r') as f:
for line in f:
lable, gene = line.split("\t")
duiying[lable.strip("\n")] = gene.strip("\n")
return duiying
def motif_Z(trap, site):
if trap[10:30][site: site + 3] in ["CAG", "CGA", "CAA"]:
return 1
else:
return 0
def motif_F(trap, site):
if "CCA" in trap[10:30][site-2: site + 3]:
return 1
else:
return 0
lable_map_gene = read_duiying("../duiying.txt")
all_cds = read_cds("../mart_export.txt")
all_ref = read_ref("ref.fa")
all_trapseq = read_seq("new508.filter.fa")
with open("pass_lst_f.txt", 'a') as wocao:
wocao.write(
"TRAP ID\tGene\tgRNA sequence\tstrand\tC>T position\tC>T efficiency\n")
with open("pass_lst_z.txt", 'a') as wocao:
wocao.write(
"TRAP ID\tGene\tgRNA sequence\tstrand\tC>T position\tC>T efficiency\n")
for name, traps in all_ref.items():
gene = lable_map_gene[name]
cds_lst = [vals for keys, vals in all_cds.items() if gene in keys]
pass_lst = []
for m in re.finditer("C", traps[10:30]):
if motif_Z(traps, m.start()):
if stop_test_for_Z((m.start() + 1), cds_lst, traps) and name in list(all_trapseq.keys()):
i = 0
for gg in all_trapseq[name]:
if gg[10:30][m.start()] == "T":
i = i + 1
eff = i/len(all_trapseq[name])
pass_lst.append(
[name, traps[10:33], "+", (m.start() + 1), eff])
if len(pass_lst) > 0:
with open("pass_lst_z.txt", 'a') as wocao:
for nima in pass_lst:
[trap_id, trap_seq, strand, ctposition, efftion] = nima
wocao.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
trap_id, gene, trap_seq, strand, ctposition, efftion))
for name, traps in all_ref.items():
gene = lable_map_gene[name]
cds_lst = [vals for keys, vals in all_cds.items() if gene in keys]
pass_lst = []
for m in re.finditer("C", traps[10:30]):
if motif_F(traps, m.start()):
if stop_test_for_F(((traps[10:30][m.start()-2: m.start() + 3]).find("CCA")+1+2), cds_lst, traps) and name in list(all_trapseq.keys()):
j = 0
for gg in all_trapseq[name]:
if gg[10:30][m.start()] == "T":
j = j + 1
eff = j/len(all_trapseq[name])
pass_lst.append(
[name, traps[10:33], "-", (m.start() + 1), eff])
if len(pass_lst) > 0:
with open("pass_lst_f.txt", 'a') as wocao:
for nima in pass_lst:
[trap_id, trap_seq, strand, ctposition, efftion] = nima
wocao.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
trap_id, gene, trap_seq, strand, ctposition, efftion))
该博客仅记录工作流程,以作为备忘录,请勿过分借鉴,也不要转载,感谢!