重复字串

重复的DNA序列

image.png
image.png

image.png

  1. class Solution {
  2. public List<String> findRepeatedDnaSequences(String s) {
  3. int L = 10, n = s.length();
  4. HashSet<String> seen = new HashSet(), output = new HashSet();
  5. // iterate over all sequences of length L
  6. for (int start = 0; start < n - L + 1; ++start) {
  7. String tmp = s.substring(start, start + L);
  8. if (seen.contains(tmp)) output.add(tmp);
  9. seen.add(tmp);
  10. }
  11. return new ArrayList<String>(output);
  12. }
  13. }

image.png
image.png

  1. class Solution {
  2. public List<String> findRepeatedDnaSequences(String s) {
  3. int L = 10, n = s.length();
  4. if (n <= L) return new ArrayList();
  5. // rolling hash parameters: base a
  6. int a = 4, aL = (int)Math.pow(a, L);
  7. // convert string to array of integers
  8. Map<Character, Integer> toInt = new
  9. HashMap() {{put('A', 0); put('C', 1); put('G', 2); put('T', 3); }};
  10. int[] nums = new int[n];
  11. for(int i = 0; i < n; ++i) nums[i] = toInt.get(s.charAt(i));
  12. int h = 0;
  13. Set<Integer> seen = new HashSet();
  14. Set<String> output = new HashSet();
  15. // iterate over all sequences of length L
  16. for (int start = 0; start < n - L + 1; ++start) {
  17. // compute hash of the current sequence in O(1) time
  18. if (start != 0)
  19. h = h * a - nums[start - 1] * aL + nums[start + L - 1];
  20. // compute hash of the first sequence in O(L) time
  21. else
  22. for(int i = 0; i < L; ++i) h = h * a + nums[i];
  23. // update output and hashset of seen sequences
  24. if (seen.contains(h)) output.add(s.substring(start, start + L));
  25. seen.add(h);
  26. }
  27. return new ArrayList<String>(output);
  28. }
  29. }

image.png
image.png
image.png
image.png
image.png

  1. class Solution {
  2. public List<String> findRepeatedDnaSequences(String s) {
  3. int L = 10, n = s.length();
  4. if (n <= L) return new ArrayList();
  5. // rolling hash parameters: base a
  6. int a = 4, aL = (int)Math.pow(a, L);
  7. // convert string to array of integers
  8. Map<Character, Integer> toInt = new
  9. HashMap() {{put('A', 0); put('C', 1); put('G', 2); put('T', 3); }};
  10. int[] nums = new int[n];
  11. for(int i = 0; i < n; ++i) nums[i] = toInt.get(s.charAt(i));
  12. int bitmask = 0;
  13. Set<Integer> seen = new HashSet();
  14. Set<String> output = new HashSet();
  15. // iterate over all sequences of length L
  16. for (int start = 0; start < n - L + 1; ++start) {
  17. // compute bitmask of the current sequence in O(1) time
  18. if (start != 0) {
  19. // left shift to free the last 2 bit
  20. bitmask <<= 2;
  21. // add a new 2-bits number in the last two bits
  22. bitmask |= nums[start + L - 1];
  23. // unset first two bits: 2L-bit and (2L + 1)-bit
  24. bitmask &= ~(3 << 2 * L);
  25. }
  26. // compute hash of the first sequence in O(L) time
  27. else {
  28. for(int i = 0; i < L; ++i) {
  29. bitmask <<= 2;
  30. bitmask |= nums[i];
  31. }
  32. }
  33. // update output and hashset of seen sequences
  34. if (seen.contains(bitmask)) output.add(s.substring(start, start + L));
  35. seen.add(bitmask);
  36. }
  37. return new ArrayList<String>(output);
  38. }
  39. }

image.png