Compute the Number of Times a Pattern Appears in a Textoverlap

rosalind.info_problems_ba1a_ (2).png

1、暴力:O(|t||p|)

如题干的算法伪代码所示:

  1. def patternCount(text: str, pattern: str) -> int:
  2. """Count overlapping substring in a given string"""
  3. m, n = len(text), len(pattern)
  4. count = 0
  5. for i in range(0, m - n + 1):
  6. if text[i:i+n] == pattern:
  7. count += 1
  8. return count
  9. s = """GCGCG
  10. GCG
  11. """
  12. print(patternCount(*s.splitlines()))
  1. pub fn count_pattern(text: &str, pattern: &str) -> u32 {
  2. let mut count = 0;
  3. let num = pattern.len();
  4. let length = text.len();
  5. // println!("Length: {}/{}", length, num);
  6. for (idx, _) in text.chars().enumerate() {
  7. let max_len = idx + num;
  8. if max_len < length {
  9. match &text[idx..(idx + num)] == pattern {
  10. true => count = count + 1,
  11. _ => continue,
  12. }
  13. }
  14. }
  15. return count;
  16. }

2、strstr

28. 实现 strStr()

  1. public class KMP {
  2. public static void main(String[] args) {
  3. int[] ababas = kmp ("ab");
  4. System.out.println (Arrays.toString (ababas));
  5. int i = kmpSearch ("abababababa", "aba", ababas);
  6. System.out.println (i);
  7. }
  8. public static int[] kmp(String s){ //AAADAAAADDA
  9. int[] temp = new int[s.length ()];
  10. temp[0] = 0;
  11. for (int i = 0,j=1; j < s.length (); j++) {
  12. while (i>0&&s.charAt (i)!=s.charAt (j)){
  13. i = temp[i-1];
  14. }
  15. if (s.charAt (i)==s.charAt (j)) {
  16. i++;
  17. }
  18. temp[j] = i;
  19. }
  20. return temp;
  21. }
  22. public static int kmpSearch(String str1,String str2,int[] next){
  23. //遍历str1
  24. int ans = 0;
  25. for (int i = 0,j = 0; i < str1.length (); i++) {
  26. while (j>0&&str1.charAt (i)!=str2.charAt (j)){
  27. j = next[j-1];
  28. }
  29. if (str1.charAt (i)==str2.charAt (j)){
  30. j++;
  31. }
  32. if (j==str2.length ()&&j>=2){
  33. ans++;
  34. j = next[j-2];
  35. i--;
  36. }
  37. }
  38. return ans;
  39. }
  40. }
  41. 作者:Condescending ColdenjK1
  42. 链接:https://leetcode.cn/circle/discuss/Cqoohu/view/FHApDf/
  43. 来源:力扣(LeetCode
  44. 著作权归作者所有。商业转载请联系作者获得授权,非商业转载请注明出处。

3、SA+Height

后缀数组+height数组,将t#s连起来后得后缀数组,排序获得height,然后找s出现位置。

Find the Most Frequent Words in a Stringoverlap

rosalind.info_problems_ba1b_.png

1、暴力:O(|t|2|k|)

枚举1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图3中所有长度为1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图4的子串作为1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图5,然后调用1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图6计算该串重叠出现次数。注意:由于1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图7中所有长度为1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图8的子串可能有重复,因此若它们出现次数都是最大,最终输出要去重。实现上就是用集合记录最频繁出现的1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图9

  1. from typing import List
  2. def patternCount(text: str, pattern: str) -> int:
  3. """Count overlapping substring in a given string"""
  4. m, n = len(text), len(pattern)
  5. count = 0
  6. for i in range(0, m - n + 1):
  7. if text[i:i+n] == pattern:
  8. count += 1
  9. return count
  10. def mostFrequentWord(text: str, k: int) -> List[str]:
  11. """Find the Most Frequent Words in a String"""
  12. ans, cnt = {}, 0 # pattern may be repeat, so we use set to deduplicate.
  13. for i in range(len(text) - k + 1):
  14. # enumerate all k-mer substring
  15. pattern = text[i:i+k]
  16. count = patternCount(text, pattern)
  17. if cnt < count:
  18. ans = {pattern}
  19. cnt = count
  20. elif cnt == count:
  21. ans.add(pattern)
  22. return list(ans)
  23. s = """ACGTTGCATGTCGCATGATGCATGAGAGCT
  24. 4
  25. """
  26. text, k = s.splitlines()
  27. print(*mostFrequentWord(text, int(k)))
  1. dna <- ("ACGTTGCATGTCGCATGATGCATGAGAGCT")
  2. n <- 4
  3. kmer.list <- numeric(nchar(dna)-(n-1))
  4. for (i in 1:(nchar(dna)-(n-1))){
  5. kmer.list[i] <- substring(dna, i, (i+(n-1)))
  6. }
  7. kmer.list <- as.matrix(subset(kmer.list, kmer.list > 0))
  8. kmer.list <- as.matrix(sort(table(kmer.list), decreasing = TRUE))
  9. frequent_words <- function(text, k) {
  10. frequentpatterns <- matrix()
  11. textunlisted <- unlist(strsplit(text, split=""))
  12. n <- nchar(text)
  13. for (i in 1:(n-k+1)) {
  14. pattern <- paste(textunlisted[i:(i+k-1)], collapse="")
  15. count[[i]] <- matrix(pattern_count(text,pattern))
  16. }
  17. maxcount <- max(count)
  18. for (i in 1:(n-k+1)){
  19. if(count[[i]] == maxcount)
  20. frequentpatterns[[i]] <- matrix(paste(textunlisted[i:(i+k-1)], collapse=""))
  21. }
  22. frequentpatterns <- unique(frequentpatterns)
  23. frequentpatterns <- frequentpatterns[!is.na(frequentpatterns)]
  24. return(frequentpatterns)
  25. }

2、k长度滑窗+滚动哈希+计数器:O(|t|+|k|)

  1. from typing import List
  2. from collections import Counter
  3. def mostFrequentWord(text: str, k: int) -> List[str]:
  4. """Find the Most Frequent Words in a String"""
  5. cnt = Counter(text[i:i+k] for i in range(len(text) - k + 1))
  6. maxCnt = max(cnt.values())
  7. return [s for s, v in cnt.items() if v == maxCnt]
  8. s = """ACGTTGCATGTCGCATGATGCATGAGAGCT
  9. 4
  10. """
  11. text, k = s.splitlines()
  12. print(*mostFrequentWord(text, int(k)))

哈希表可以通过 排序+双指针 实现计数

3、SA

Explanation

rosalind.info_problems_ba1b_explanation_.png

Find the Reverse Complement of a String

rosalind.info_problems_ba1c_.png

1、模拟

  1. def reverseComplementOfDNA(dna: str) -> str:
  2. """Find the Reverse Complement of a String"""
  3. # return dna.translate(str.maketrans('AGCT', 'TCGA'))[::-1]
  4. d = dict(zip('AGCT', 'TCGA'))
  5. n = len(dna)
  6. ret = [''] * n
  7. for i in range(n):
  8. ret[n-1-i] = d[dna[i]]
  9. return "".join(ret)
  10. print(reverseComplementOfDNA("AAAACCCGGT"))

Explanation

rosalind.info_problems_ba1c_explanation_.png

Find All Occurrences of a Pattern in a Stringoverlap

rosalind.info_problems_ba1d_.png

1、KMP找出所有匹配位置

28. 实现 strStr()

  1. from typing import List
  2. def strstr(t: str, p: str) -> List[int]:
  3. """Find all the occurence place of p in t"""
  4. m, n = len(t), len(p)
  5. if n == 0:
  6. return [0]
  7. # calculate next array
  8. next = [0] * n
  9. i, j = 1, 0
  10. while i < n:
  11. while j > 0 and p[i] != p[j]:
  12. j = next[j-1]
  13. if p[i] == p[j]:
  14. j += 1
  15. next[i] = j
  16. i += 1
  17. # print(next)
  18. # KMP
  19. idxs = []
  20. i, j = 0, 0
  21. while i < m:
  22. while j > 0 and t[i] != p[j]:
  23. j = next[j-1]
  24. if t[i] == p[j]:
  25. j += 1
  26. if j == n:
  27. # we can return i - n + 1 if find the first occurence place,
  28. # otherwise assume t[i] != p[n-1]
  29. idxs.append(i - n + 1)
  30. j = next[n-1]
  31. i += 1
  32. # print(idxs)
  33. return idxs
  34. s = """ATAT
  35. GATATATGCATATACTT
  36. """
  37. p, t = s.splitlines()
  38. print(*strstr(t, p))
  1. fn main() {
  2. let pattern = "ATAT";
  3. let genome = String::from("GATATATGCATATACTT");
  4. let positions = find_pattern_positions(pattern, genome) ;
  5. for pos in positions {
  6. print!("{} ",pos)
  7. }
  8. }
  9. fn find_pattern_positions(pattern: &str, genome:String) -> Vec<u32> {
  10. let mut positions: Vec<u32> = Vec::new();
  11. for i in 0..genome.len()-pattern.len(){
  12. if genome[i..i+pattern.len()] == *pattern {
  13. positions.push(i as u32);
  14. }
  15. }
  16. positions
  17. }


Find Patterns Forming Clumps in a String

rosalind.info_problems_ba1e_.png

1、暴力:O(L2k|Genome|)

  1. def FrequencyTable(text, k):
  2. freq_map = {}
  3. n = len(text)
  4. for i in range(0, n - k + 1):
  5. pattern = text[i:i + k]
  6. if pattern not in freq_map:
  7. freq_map[pattern] = 1
  8. else:
  9. freq_map[pattern] = freq_map[pattern] + 1
  10. return freq_map
  11. def FindClumps(text, k, l, t):
  12. patterns = set()
  13. n = len(text)
  14. for i in range(n - l + 1):
  15. window = text[i:i + l]
  16. freq_map = FrequencyTable(window, k)
  17. for s in freq_map:
  18. if freq_map[s] >= t:
  19. patterns.add(s)
  20. return ' '.join(patterns)
  21. filename = 'rosalind_ba1e.txt'
  22. file = open(filename, 'r')
  23. lines = file.readlines()
  24. text = lines[0].strip()
  25. params = lines[1].strip().split()
  26. k, l, t = [int(p) for p in params]
  27. print(FindClumps(text, k, l, t))

2、所有k-mer出现位置+第i个出现位置到第i+t个出现位置的距离是否≤L

区间段覆盖问题

  1. from typing import List
  2. def clumpFind(genome: str, k: int, L: int, t: int) -> List[str]:
  3. # O(nk): enumerate k-mer in genome
  4. n = len(genome)
  5. kmerIdxs = {} # {k-mer: indexs}
  6. for i in range(n - k + 1):
  7. p = genome[i:i+k] # can be optimize with rolling hash
  8. # add the occurrence of p in Genome
  9. kmerIdxs.setdefault(p, []).append(i)
  10. ans = []
  11. # 时间复杂度较难计算,但<<O(n^2)
  12. # 对每个k-mer出现位置,看L长度内有多少个(滑窗O(|idx|)解决)
  13. # if the number overlap occurence idx >= t
  14. for s, idx in kmerIdxs.items():
  15. left = 0
  16. for right in range(len(idx)):
  17. # slide in idx[right]
  18. # check the length of idx[left]~idx[right]
  19. while left <= right and idx[right] - idx[left] + 1 > L:
  20. left += 1
  21. if right - left + 1 >= t:
  22. ans.append(s)
  23. break
  24. return ans
  25. s = """CGGACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAATGTACGAATGTAATGTGCCTATGGC
  26. 5 75 4
  27. """
  28. lines = s.splitlines()
  29. genome = lines[0]
  30. k, L, t = map(int, lines[1].split())
  31. print(*clumpFind(genome, k, L, t))

3、评论区最优解法:长度为L的滑窗(初始化+滑入/出一个k-mer)

rosalind.info_problems_ba1e_solutions_ (1).png
本题可以overlap,左侧滑出1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图16,那么起点为1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图17的 k-mer 数少一,滑入1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图18,那么起点为1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图19的 k-mer 数加一,窗口内的所有起点1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图20的 k-mer 数不变,因此每次滑动只需要判断新进来的起点为1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图21的 k-mer 数是否 ≥ t
总时间复杂度为1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图22

  • 初始化滑窗:枚举长度为1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图23窗口内所有长度为1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图24的子串需要1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图25,可用滚动哈希优化为1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图26
  • 每次滑动转移需要1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图27计算滑入/出的k-mer串的哈希值,时间复杂度为1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图28 :::info 进阶:非overlap
    30. 串联所有单词的子串,时间复杂度为O(k×|G|k) :::

    Explanation

    rosalind.info_problems_ba1e_explanation_.png

    Find a Position in a Genome Minimizing the Skew

    rosalind.info_problems_ba1f_.png

    1、G,C目前累计数量->G-C差值

    ```python from typing import List

def minimumSkew(dna: str) -> List[int]: n = len(dna) minv, ans = n, [] G = C = 0 for i in range(n): if dna[i] == ‘G’: G += 1 elif dna[i] == ‘C’: C += 1 if G - C < minv: ans = [i + 1] # next place is oriC minv = G - C elif G - C == minv: ans.append(i + 1) # next place is oriC return ans

s = “””CCTATCGGTGGATTAGCATGTCCCTGTACGTTTCGCCGCGAACTAGTTCACACGGCTTGATGGCAAATGGTTTTTCCGGCGACCGTAATCGTCCACCGAG”””

print(*minimumSkew(s))

  1. <a name="CkIoo"></a>
  2. # [Compute the Hamming Distance Between Two Strings](https://rosalind.info/problems/ba1g/)
  3. ![rosalind.info_problems_ba1g_.png](https://cdn.nlark.com/yuque/0/2022/png/1438957/1652190714036-2612c61f-beab-4f84-8a24-73b2cf68978f.png#clientId=u4f4af40e-e31d-4&crop=0&crop=0&crop=1&crop=1&from=drop&id=ubc71f5fb&margin=%5Bobject%20Object%5D&name=rosalind.info_problems_ba1g_.png&originHeight=654&originWidth=1175&originalType=binary&ratio=1&rotation=0&showTitle=false&size=46271&status=done&style=shadow&taskId=u0dd8af46-763b-46fc-88b4-459e8dbab90&title=)
  4. <a name="YcT8P"></a>
  5. ## 1、汉明距离:统计每位s[i]!=t[i]总数
  6. ```python
  7. def hammingDistance(s: str, t: str) -> int:
  8. dist = 0
  9. for ch1, ch2 in zip(s, t):
  10. if ch1 != ch2:
  11. dist += 1
  12. return dist
  13. s = """GGGCCGTTGGT
  14. GGACCGTTGAC"""
  15. s, t = s.splitlines()
  16. print(hammingDistance(s, t))

Find All Approximate Occurrences of a Pattern in a String

rosalind.info_problems_ba1h_.png

1、暴力解:O(|T||P|)

枚举text中所有长度为1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图32的子串1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图33, 检验其与1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图34的汉明距离是否小于等于d,是就将索引1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图35加入答案。

  1. from typing import List
  2. def hammingDistance(s: str, t: str) -> int:
  3. dist = 0
  4. for ch1, ch2 in zip(s, t):
  5. if ch1 != ch2:
  6. dist += 1
  7. return dist
  8. def approximatePatternMatch(text: str, pattern: str, d: int) -> List[int]:
  9. m, n = len(text), len(pattern)
  10. ans = []
  11. for i in range(m - n + 1):
  12. if hammingDistance(text[i:i+n], pattern) <= d:
  13. ans.append(i)
  14. return ans
  15. s = """ATTCTGGA
  16. CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAATGCCTAGCGGCTTGTGGTTTCTCCTACGCTCC
  17. 3
  18. """
  19. pattern, text, d = s.splitlines()
  20. print(*approximatePatternMatch(text, pattern, int(d)))

2、待更新DP

允许错配
https://en.wikipedia.org/wiki/Approximate_string_matching
https://www.cs.unc.edu/~prins/Classes/555/Media/Lec18.pdf
测试用例:
https://bioinformaticsalgorithms.com/data/debugdatasets/replication/ApproximatePatternMatchingProblem.pdf

Generate the d-Neighborhood of a StringBFS/DFS

rosalind.info_problems_ba1n_.png

1、枚举所有长度为|Pattern|的组成可能+汉明距离筛选

最直观的方式就是生成1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图37个所有可能序列,然后在它们中筛出与1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图38的汉明距离≤d的构成答案。

  1. def get_d_neighbors(pattern: str, d: int) -> str:
  2. return "\n".join(
  3. [
  4. x
  5. for x in (
  6. "".join(i)
  7. for i in itertools.product(["A", "T", "G", "C"], repeat=len(pattern))
  8. )
  9. if hamming_distance(pattern, x) <= d
  10. ]
  11. )

复杂度分析

  • 时间复杂度1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图39,一次汉明距离筛选需要1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图40,总共进行1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图41
  • 空间复杂度1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图42

    2、记录枚举树中所有节点

    :::info 书本上代码允许一个位置1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图43突变多次,且允许1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图44突变为自身,因此可能出现s[i]->...->s[i],导致产生很多重复集,最终结果需要使用集合去重。
    如果我们限制1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图45最多突变一次且只能突变为非1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图46字符,就能保证每次突变产生不同结果。 ::: 如下图s = ACG, d = 2的枚举树,层序号就是与s的汉明距离,例如第1层表示其与s距离为1。其中淡蓝色的节点是重复出现,去重方式就是保证突变位置递增,因此所求答案就是所有非重复树节点。
    标记重复.svg
    复杂度分析:设1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图48长度为1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图49,字符集1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图50,汉明距离差为1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图51

  • 时间复杂度1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图52,分别枚举突变次数从1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图53,第1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图54次突变可发生在1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图55个位置之一,该次突变结果有1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图56种(不允许变成自己),第1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图57次突变可发生在1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图58个位置之一(不允许一个位置突变多次),该次突变结果有1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图59种(不允许变成自己),因此突变数为1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图60时,产生1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图61种结果。

  • 空间复杂度1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图62

    实现1:DFS/回溯

    评论区的DFS算法,其去重方式要求每次修改位置必须是递增,因此其枚举树就是上图 ```python def Neighbors(Pattern, k, d, start = 0): neighbors = [Pattern] # 初始化把自己加入 if d > 0:
    1. for i in range(start, k):
    2. for symbol in "ACGT":
    3. if symbol != Pattern[i]:
    4. new_pattern = Pattern[:i] + symbol + Pattern[i+1:]
    5. new_neighbors = Neighbors(new_pattern, k, d - 1, i + 1) # 注意i+1
    6. neighbors.extend(new_neighbors)
    return neighbors

kmer = input() d = int(input()) neighbors = Neighbors(kmer, len(kmer), d) for neighbor in neighbors: print(neighbor)

  1. 01选取:对于每个位置![](https://cdn.nlark.com/yuque/__latex/b355b418b4897ebe9ef04778df03a3b1.svg#card=math&code=s%5Bi%5D&id=d9BD3)有两种选择,突变或不突变:
  2. - 不突变:那么总突变数![](https://cdn.nlark.com/yuque/__latex/56c1b0cb7a48ccf9520b0adb3c8cb2e8.svg#card=math&code=d&id=TRDqr)不变,继续向后求解![](https://cdn.nlark.com/yuque/__latex/5195e3a1c22eddbccb14d553c0c35622.svg#card=math&code=s%5Bi%2B1%2C%5D&id=dk8OT)中总突变数为![](https://cdn.nlark.com/yuque/__latex/56c1b0cb7a48ccf9520b0adb3c8cb2e8.svg#card=math&code=d&id=e2fF6)的结果
  3. - 突变:前提是![](https://cdn.nlark.com/yuque/__latex/c9e1e0c625406b78101eb07fd0d7c453.svg#card=math&code=d%5Cgt%200&id=x7GJk),当前突变结果有三种(下面代码规定![](https://cdn.nlark.com/yuque/__latex/b355b418b4897ebe9ef04778df03a3b1.svg#card=math&code=s%5Bi%5D&id=xezfv)只允许一次有效突变:![](https://cdn.nlark.com/yuque/__latex/b355b418b4897ebe9ef04778df03a3b1.svg#card=math&code=s%5Bi%5D&id=dyf50)不能突变两次或突变一次变回![](https://cdn.nlark.com/yuque/__latex/b355b418b4897ebe9ef04778df03a3b1.svg#card=math&code=s%5Bi%5D&id=JTMsC)),每种使用回溯选择一下,继续向后求解![](https://cdn.nlark.com/yuque/__latex/5195e3a1c22eddbccb14d553c0c35622.svg#card=math&code=s%5Bi%2B1%2C%5D&id=ixVcZ)中总突变数为![](https://cdn.nlark.com/yuque/__latex/e6ecc3ef1343a9b617b11e1dcb6470c5.svg#card=math&code=d-1&id=agpIO)的结果
  4. ```python
  5. from typing import List
  6. def neighbors(pattern: str, d: int) -> List[str]:
  7. s = list(pattern) # 转数组可修改
  8. ans = [] # 回溯生成不重复,每个s[i]只能变为字符集内其它字符
  9. def backtrace(idx: int, d: int) -> None: # 去重一:保证idx递增
  10. # 递归出口
  11. if idx >= len(s) or d == 0:
  12. ans.append("".join(s))
  13. return
  14. # 当前选择不变
  15. backtrace(idx + 1, d)
  16. # 当前选择突变
  17. if d > 0:
  18. orginBase = s[idx]
  19. for base in "ACGT":
  20. if base != orginBase: # 去重二:保证s[i]变为其它字符
  21. s[idx] = base # 选择
  22. backtrace(idx + 1, d - 1)
  23. s[idx] = orginBase # 撤销
  24. backtrace(0, d)
  25. # print(len(ans) == len(set(ans)))
  26. return ans
  27. s = """ACG
  28. 1
  29. """
  30. s, d = s.splitlines()
  31. print(*neighbors(s, int(d)), sep='\n')

实现2:BFS【从src向下搜d层】

从上往下一层层搜索,但是怎么去重呢?

  1. def mutation(s, d):
  2. ans = set()
  3. queue = []
  4. dist = {}
  5. queue.append(s)
  6. # 此dist字典没必要,其就是记录BFS层数,使用layer变量记录即可
  7. dist[s] = 0 # d[t]表示s->...->t共花费几步(允许同位置变多次+变为自己)
  8. ans.add(s) # 使用集合去重
  9. while len(queue) != 0:
  10. cur = queue.pop(0) # 可以改为collections.deque
  11. for i in range(len(cur)):
  12. cur_l = list(cur)
  13. for j in bp: # 允许s[i]→s[i],因此会产生重复解
  14. cur_l[i] = j
  15. new_str = "".join(cur_l)
  16. #print(new_str)
  17. # 使用ans标记重复,防止重复节点搜索,也就是s[i]只能变为非s[i]字符
  18. if new_str not in ans and dist[cur] + 1 <= d:
  19. ans.add(new_str)
  20. queue.append(new_str)
  21. dist[new_str] = dist[cur] + 1
  22. return ans

上面代码使用1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图63来标记访问过的节点,这样可以保证不重复,即最终所求答案,但是dist字典根本没用
我们可以使用每层突变位置必须是递增的方式来去重,也就是节点设计为(src, mutPos),下一个节点突变位置只能在mutPos + 1 -> |src|-1

  1. from typing import List
  2. def neighbors(pattern: str, d: int) -> List[str]:
  3. ans = [pattern] # d=0将自己加入
  4. from collections import deque
  5. q = deque([(pattern, -1)])
  6. while d:
  7. for _ in range(len(q)):
  8. u, mutPos = q.popleft()
  9. # 突变位置递增且s[i]只能变为字符集内其它字符可保证生成结果不重复
  10. for i in range(mutPos + 1, len(u)):
  11. for base in "ACGT":
  12. if u[i] != base:
  13. v = u[:i] + base + u[i+1:]
  14. ans.append(v) # 记录入队节点
  15. q.append((v, i))
  16. # print("next layer: ", q)
  17. d -= 1
  18. # print(len(ans) == len(set(ans)))
  19. return ans
  20. s = """ACG
  21. 2
  22. """
  23. s, d = s.splitlines()
  24. print(*neighbors(s, int(d)), sep='\n')

Explanation

rosalind.info_problems_ba1n_explanation_.png
书本上的代码是允许1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图65多次突变+允许1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图66突变回1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图67,这样就会出现大量重复解,因此需要用集合去重。

Implement PatternToNumber字符串转对应四进制数

rosalind.info_problems_ba1l_.png

1、分支界定法:x是字典序第几(从0开始计数)

AGT排第几?前缀AG位于第二层且序号为2,即左侧有2个兄弟节点,它们每个可向下生长N=4个分支,因此AG的第一个孩子序号为2 * N,而T是AG的第4个字典序孩子,因此其序号为2 * N + 3
AGTG排第几?前缀AGT位于第三层且序号在为11,即左侧有11个兄弟节点,它们每个可向下生长N=4个分支,因此共AGT的的第一个孩子序号为11 * N,而G是AGT的第3个字典序孩子,因此其序号为11 * N + 2
agct.svg
综上有公式:
当前串的序号 = prefix的序号 分支个数N + 当前字符ch的字典序
那么prefix串的序号怎么求呢?其可以继续带入上面公式,因此这个是递归问题。
递归函数定义1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图70返回1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图71串在由字符集1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图72构成的字典序顺序(从0开始计数)
递归出口:当1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图73时,直接看该字符在1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图74排第几返回;实际也可将1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图75时返回1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图76当作出口,因为其不影响结果。
递归链条:若1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图77,拆分除前缀1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图78和末尾字符1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图79,当前1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图80的序号 = 1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图81序号
分支个数N + 当前字符1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图82的字典序

  1. symbol2index = dict(zip('ACGT', range(4)))
  2. def pattern2number(pattern: str) -> int:
  3. if not pattern:
  4. return 0
  5. number = symbol2index[pattern[-1]]
  6. return 4 * pattern2number(pattern[:-1]) + number

2、N进制

上面方法本质上就是四进制计数,因为一共有4个类型,因此可以用0, 1, 2, 3分别表示碱基A, C, G, T。例如我们枚举0-15序号的串

AAA
000
00 00 00
AAC
001
00 00 01
AAG
002
00 00 10
AAT
003
00 00 11
ACA
010
00 01 00
ACC
011
00 01 01
ACG
012
00 01 01
ACT
013
00 01 11
AGA
020
00 10 00
AGC
021
00 10 01
AGG
022
00 10 10
AGT
023
00 10 11
ATA
030
00 11 00
ATC
031
00 11 01
ATG
032
00 11 10
ATT
033
00 11 11

2位比特可以表示0-3范围的数,因此如上所示数字越小就表示其字典序越小

  1. symbol2index = dict(zip('ACGT', range(4)))
  2. def pattern2number(pattern: str) -> int:
  3. digit = 0
  4. for ch in pattern:
  5. digit = digit * 4 + symbol2index[ch]
  6. return digit
  7. print(pattern2number("AGTG"))

Implement NumberToPattern四进制数转对应字符串

rosalind.info_problems_ba1m_.png

1、从四进制数恢复字符串

上面字符串转四进制数的逆过程,我们可以对四进制数不断%4得最低位的权值,将此权值”翻译”回对应字符,然后不断重复将其/4获得次低位的权值,直到四进制数变为零。 :::info 注意:
题目给定四进制数index和最终字符串长度k的目的是防止index=0时,上述算法就会以为结束,输出位数可能不对。 :::

  1. def number2pattern(index: int, k: int) -> str:
  2. # 递归出口
  3. if k == 0:
  4. return ""
  5. # 递归链条
  6. prefixIndex, index = divmod(index, 4)
  7. return number2pattern(prefixIndex, k - 1) + index2symbol[index]
  8. print(number2pattern(45, 4))
  1. def number2pattern(index: int, k: int) -> str:
  2. ret = [''] * k
  3. while k > 0:
  4. index, r = divmod(index, 4)
  5. ret[k-1] = index2symbol[r] # 逆序填充
  6. k -= 1
  7. return "".join(ret)
  8. print(number2pattern(45, 4))

Find the Most Frequent Words with Mismatches in a String4^k个k-mer频数表

rosalind.info_problems_ba1k_.png

1、开足4k哈希桶(枚举所有k-mer串可能情况)

  1. from typing import List
  2. symbol2int = dict(zip('ACGT', range(4)))
  3. def pattern2number(pattern: str) -> int:
  4. if not pattern:
  5. return 0
  6. number = symbol2int[pattern[-1]]
  7. return 4 * pattern2number(pattern[:-1]) + number
  8. def computingFrequencyArray(text: str, k: int) -> List[int]:
  9. table = [0] * (4 ** k)
  10. for i in range(len(text) - k + 1):
  11. table[pattern2number(text[i:i+k])] += 1 # 转为4进制,hasher
  12. return table
  13. s = """ACGCGGCTCTGAAA
  14. 2
  15. """
  16. lines = s.splitlines()
  17. text = lines[0]
  18. k = int(lines[1])
  19. print(*computingFrequencyArray(text, k))

:::info 只是本题要求所有k-mer在Text串中的频数表,这样耗费1、WHERE IN THE GENOME DOES DNA REPLICATION BEGIN? - 图85空间。 :::

Find the Most Frequent Words with Mismatches in a String4^k个k-mer汉明距离≤d

rosalind.info_problems_ba1i_.png

1、枚举4k个k-mer并用暴力双指针匹配

  1. from typing import List
  2. def hammingDistance(s: str, t: str) -> int:
  3. dist = 0
  4. for ch1, ch2 in zip(s, t):
  5. if ch1 != ch2:
  6. dist += 1
  7. return dist
  8. def approximatePatternMatch(text: str, pattern: str, d: int) -> List[int]:
  9. m, n = len(text), len(pattern)
  10. ans = []
  11. for i in range(m - n + 1):
  12. if hammingDistance(text[i:i+n], pattern) <= d:
  13. ans.append(i)
  14. return ans
  15. def frequentWordswithMismatches(text: str, k: int, d: int) -> List[str]:
  16. def backtrace(n: int, path: List[str], ans: List[str]) -> None:
  17. if n == 0:
  18. ans.append("".join(path))
  19. return
  20. for base in 'AGCT':
  21. path.append(base)
  22. backtrace(n - 1, path, ans)
  23. path.pop()
  24. kmers = []
  25. backtrace(k, [], kmers)
  26. # print(kmers)
  27. ans, maxv = [], 0
  28. for pattern in kmers:
  29. cnt = len(approximatePatternMatch(text, pattern, d))
  30. if cnt > maxv:
  31. ans = [pattern]
  32. maxv = cnt
  33. elif cnt == maxv:
  34. ans.append(pattern)
  35. return ans
  36. s = """ACGTTGCATGTCGCATGATGCATGAGAGCT
  37. 4 1
  38. """
  39. lines = s.splitlines()
  40. text = lines[0]
  41. k, d= map(int, lines[1].split())
  42. print(*frequentWordswithMismatches(text, k, d))

并行计算:

Find Frequent Words with Mismatches and Reverse Complements

rosalind.info_problems_ba1j_ (1).png