第三方协助算法 - trie tree 字典树（前缀树）模糊搜索用 - 《算法》

方案一：

#!/usr/bin python3
# -*- encoding: utf-8 -*-
'''
@File    :   charcorrect.py
@Time    :   2020/07/28 11:27:03
@Author  :   陈培杞
@Version :   1.0
@docs    :  
        原始数据集样本格式
            * wp_2gram.txt:
                1 the git
                2 in the
            * eml.txt
                Héllo I'm a programmér who crackéd your émail account and dévicé about
        过程：
            1. 将原始数据结构化为dict —— 按字符串长度进行分桶处理。{2:['aa','bb'], 8:['isthethe','aboutyou']}
            2. 将字符转为ASCII码
            3. 处理成 faiss 模块的输入形式，分桶搜索
'''
from pathlib import Path
project_path = Path(__file__).parent.resolve()
import re
import ast
import json
import numpy as np
from collections import defaultdict
class TrieTree(object):
    """
        用于模糊搜索
    """
    def __init__(self):
        self.root={}
        self.word_end = -1
    def add(self,word):
        nownode=self.root
        for char in word:
            if char not in nownode:
                nownode[char] = {}
            nownode = nownode[char]
        nownode[self.word_end] = True
    def search(self, word):
        nownode = self.root
        for char in word:
            if  char not in nownode:
                return False
            nownode = nownode[char]
        if self.word_end not in nownode:
            return False
        return True 
    def fuzzyMatch(self, tree, word):
        nowNode = tree
        matchWord = ''
        matchWordList=[]
        for i,char in enumerate(word):
            if char  in nowNode:
                matchWord += char
                nowNode = nowNode[char]
            else:
                if ord(char) <= 122 :  # 既不在树中，也不是通配符，返回空串
                    return ['']
                else: # 通配符处理
                    for key in nowNode.keys():
                        nextNodes = nowNode[key]
                        if key == self.word_end:
                            return ['']
                        NextMatchWordList = self.fuzzyMatch(nextNodes, word[i+1:])                        
                        matchWordList += [ matchWord + key + chars for chars in NextMatchWordList]
        if matchWordList :
            return matchWordList
        else:
            return [matchWord]
    def getTree(self):
        return self.root
def calculateSimilarity( searchDict:dict, charlength=5):
    key = charlength
    dict_eml  = getProcessedData(f"{EmlSaveFile}_{charlength}")
    dict_wiki = getProcessedData(f"{WikiSaveFile}_{charlength}")
    # trie tree 检索
    mytree = TrieTree()
    for word in dict_wiki[key]:
        mytree.add(word)
    for word in dict_eml[key]:
        matchResult = mytree.fuzzyMatch(mytree.getTree(), word)
        matchResult = [ i for i in matchResult if len(i)==len(word) ]
        searchDict[word] = list(set(matchResult))
if __name__=='__main__':
        searchDict = dict()
        calculateSimilarity(searchDict)

方案二：

import collections
class TrieNode:
    # Initialize your data structure here.
    def __init__(self):
        self.children = collections.defaultdict(TrieNode)
        self.is_word = False
class Trie:
    def __init__(self):
        self.root = TrieNode()
    def insert(self, word):
        current = self.root
        for letter in word:
            current = current.children[letter]
        current.is_word = True
    def search(self, word):
        current = self.root
        for letter in word:
            current = current.children.get(letter)
            if current is None:
                return False
        return current.is_word
    def startsWith(self, prefix):
        current = self.root
        for letter in prefix:
            current = current.children.get(letter)
            if current is None:
                return False
        return True
    def enumerateMatch(self, word, space="_", backward=False):
        matched = []
        ## while len(word) > 1 does not keep character itself, while word keed character itself
        while len(word) > 1:
            if self.search(word):
                matched.append(space.join(word[:]))
            del word[-1]
        return matched