方案一:

    1. #!/usr/bin python3
    2. # -*- encoding: utf-8 -*-
    3. '''
    4. @File : charcorrect.py
    5. @Time : 2020/07/28 11:27:03
    6. @Author : 陈培杞
    7. @Version : 1.0
    8. @docs :
    9. 原始数据集样本格式
    10. * wp_2gram.txt:
    11. 1 the git
    12. 2 in the
    13. * eml.txt
    14. Héllo I'm a programmér who crackéd your émail account and dévicé about
    15. 过程:
    16. 1. 将原始数据结构化为dict —— 按字符串长度进行分桶处理。{2:['aa','bb'], 8:['isthethe','aboutyou']}
    17. 2. 将字符转为ASCII码
    18. 3. 处理成 faiss 模块的输入形式,分桶搜索
    19. '''
    20. from pathlib import Path
    21. project_path = Path(__file__).parent.resolve()
    22. import re
    23. import ast
    24. import json
    25. import numpy as np
    26. from collections import defaultdict
    27. class TrieTree(object):
    28. """
    29. 用于模糊搜索
    30. """
    31. def __init__(self):
    32. self.root={}
    33. self.word_end = -1
    34. def add(self,word):
    35. nownode=self.root
    36. for char in word:
    37. if char not in nownode:
    38. nownode[char] = {}
    39. nownode = nownode[char]
    40. nownode[self.word_end] = True
    41. def search(self, word):
    42. nownode = self.root
    43. for char in word:
    44. if char not in nownode:
    45. return False
    46. nownode = nownode[char]
    47. if self.word_end not in nownode:
    48. return False
    49. return True
    50. def fuzzyMatch(self, tree, word):
    51. nowNode = tree
    52. matchWord = ''
    53. matchWordList=[]
    54. for i,char in enumerate(word):
    55. if char in nowNode:
    56. matchWord += char
    57. nowNode = nowNode[char]
    58. else:
    59. if ord(char) <= 122 : # 既不在树中,也不是通配符,返回空串
    60. return ['']
    61. else: # 通配符处理
    62. for key in nowNode.keys():
    63. nextNodes = nowNode[key]
    64. if key == self.word_end:
    65. return ['']
    66. NextMatchWordList = self.fuzzyMatch(nextNodes, word[i+1:])
    67. matchWordList += [ matchWord + key + chars for chars in NextMatchWordList]
    68. if matchWordList :
    69. return matchWordList
    70. else:
    71. return [matchWord]
    72. def getTree(self):
    73. return self.root
    74. def calculateSimilarity( searchDict:dict, charlength=5):
    75. key = charlength
    76. dict_eml = getProcessedData(f"{EmlSaveFile}_{charlength}")
    77. dict_wiki = getProcessedData(f"{WikiSaveFile}_{charlength}")
    78. # trie tree 检索
    79. mytree = TrieTree()
    80. for word in dict_wiki[key]:
    81. mytree.add(word)
    82. for word in dict_eml[key]:
    83. matchResult = mytree.fuzzyMatch(mytree.getTree(), word)
    84. matchResult = [ i for i in matchResult if len(i)==len(word) ]
    85. searchDict[word] = list(set(matchResult))
    86. if __name__=='__main__':
    87. searchDict = dict()
    88. calculateSimilarity(searchDict)

    方案二:

    1. import collections
    2. class TrieNode:
    3. # Initialize your data structure here.
    4. def __init__(self):
    5. self.children = collections.defaultdict(TrieNode)
    6. self.is_word = False
    7. class Trie:
    8. def __init__(self):
    9. self.root = TrieNode()
    10. def insert(self, word):
    11. current = self.root
    12. for letter in word:
    13. current = current.children[letter]
    14. current.is_word = True
    15. def search(self, word):
    16. current = self.root
    17. for letter in word:
    18. current = current.children.get(letter)
    19. if current is None:
    20. return False
    21. return current.is_word
    22. def startsWith(self, prefix):
    23. current = self.root
    24. for letter in prefix:
    25. current = current.children.get(letter)
    26. if current is None:
    27. return False
    28. return True
    29. def enumerateMatch(self, word, space="_", backward=False):
    30. matched = []
    31. ## while len(word) > 1 does not keep character itself, while word keed character itself
    32. while len(word) > 1:
    33. if self.search(word):
    34. matched.append(space.join(word[:]))
    35. del word[-1]
    36. return matched