方案一:
#!/usr/bin python3
# -*- encoding: utf-8 -*-
'''
@File : charcorrect.py
@Time : 2020/07/28 11:27:03
@Author : 陈培杞
@Version : 1.0
@docs :
原始数据集样本格式
* wp_2gram.txt:
1 the git
2 in the
* eml.txt
Héllo I'm a programmér who crackéd your émail account and dévicé about
过程:
1. 将原始数据结构化为dict —— 按字符串长度进行分桶处理。{2:['aa','bb'], 8:['isthethe','aboutyou']}
2. 将字符转为ASCII码
3. 处理成 faiss 模块的输入形式,分桶搜索
'''
from pathlib import Path
project_path = Path(__file__).parent.resolve()
import re
import ast
import json
import numpy as np
from collections import defaultdict
class TrieTree(object):
"""
用于模糊搜索
"""
def __init__(self):
self.root={}
self.word_end = -1
def add(self,word):
nownode=self.root
for char in word:
if char not in nownode:
nownode[char] = {}
nownode = nownode[char]
nownode[self.word_end] = True
def search(self, word):
nownode = self.root
for char in word:
if char not in nownode:
return False
nownode = nownode[char]
if self.word_end not in nownode:
return False
return True
def fuzzyMatch(self, tree, word):
nowNode = tree
matchWord = ''
matchWordList=[]
for i,char in enumerate(word):
if char in nowNode:
matchWord += char
nowNode = nowNode[char]
else:
if ord(char) <= 122 : # 既不在树中,也不是通配符,返回空串
return ['']
else: # 通配符处理
for key in nowNode.keys():
nextNodes = nowNode[key]
if key == self.word_end:
return ['']
NextMatchWordList = self.fuzzyMatch(nextNodes, word[i+1:])
matchWordList += [ matchWord + key + chars for chars in NextMatchWordList]
if matchWordList :
return matchWordList
else:
return [matchWord]
def getTree(self):
return self.root
def calculateSimilarity( searchDict:dict, charlength=5):
key = charlength
dict_eml = getProcessedData(f"{EmlSaveFile}_{charlength}")
dict_wiki = getProcessedData(f"{WikiSaveFile}_{charlength}")
# trie tree 检索
mytree = TrieTree()
for word in dict_wiki[key]:
mytree.add(word)
for word in dict_eml[key]:
matchResult = mytree.fuzzyMatch(mytree.getTree(), word)
matchResult = [ i for i in matchResult if len(i)==len(word) ]
searchDict[word] = list(set(matchResult))
if __name__=='__main__':
searchDict = dict()
calculateSimilarity(searchDict)
方案二:
import collections
class TrieNode:
# Initialize your data structure here.
def __init__(self):
self.children = collections.defaultdict(TrieNode)
self.is_word = False
class Trie:
def __init__(self):
self.root = TrieNode()
def insert(self, word):
current = self.root
for letter in word:
current = current.children[letter]
current.is_word = True
def search(self, word):
current = self.root
for letter in word:
current = current.children.get(letter)
if current is None:
return False
return current.is_word
def startsWith(self, prefix):
current = self.root
for letter in prefix:
current = current.children.get(letter)
if current is None:
return False
return True
def enumerateMatch(self, word, space="_", backward=False):
matched = []
## while len(word) > 1 does not keep character itself, while word keed character itself
while len(word) > 1:
if self.search(word):
matched.append(space.join(word[:]))
del word[-1]
return matched