参考资料:
【python 走进NLP】搜索提示功能前缀字典树
巧用 Trie 树实现搜索引擎关键词提示功能
python利用Trie(前缀树)实现搜索引擎中关键字输入提示(学习Hash Trie和Double-array Trie)
站内搜索是一个网站的基本功能,一个好的搜索提示也能很好地提升用户体验,提高用户找到自己需要的东西的效率。下面基于字典树实现一个简单的中文前缀搜索提示功能。
用户输入的时候自动提示。
# -*- encoding=utf-8 -*-
"""Minimal prefix-based search suggestion built on a pytrie StringTrie."""

# Imports
from pytrie import StringTrie


class Suggester(object):
    """Prefix trie that maps each lower-cased word to itself."""

    def __init__(self):
        self.trie = StringTrie()

    def update_trie(self, word_list):
        """Insert every word of ``word_list`` into the trie.

        :param word_list: iterable of words (strings)
        """
        for word in word_list:
            # Keys are stored lower-cased, so lookups must use lower-case
            # prefixes as well.
            word = word.lower()
            self.trie[word] = word

    def search_prefix(self, prefix):
        """Return the stored words whose key starts with ``prefix``.

        :param prefix: prefix to look up
        :return: values of all trie entries under ``prefix``
        """
        return self.trie.values(prefix=prefix)


def build_prefix_tree(wordlist, encoding='utf-8'):
    """Build a Suggester from a text file that holds one word per line.

    :param wordlist: path of the word-list file
    :param encoding: text encoding used to decode the file
    :return: a Suggester filled with the words of the file
    """
    # The context manager closes the file handle; the explicit encoding
    # keeps decoding independent of the platform's default locale.
    with open(wordlist, encoding=encoding) as word_file:
        word_list = word_file.read().splitlines()
    suggester = Suggester()
    suggester.update_trie(word_list)
    return suggester


if __name__ == '__main__':
    sug = Suggester()
    sug.update_trie(['意味深长', '意想不到', '意气用事', '意气风发', '意兴阑珊', '意气高昂'])
    print(sug.search_prefix('意'))
    print(sug.search_prefix('意气'))
运行结果:
E:\laidefa\python.exe E:/短信报警/SearchSuggestion-master/backend/实现搜索提示功能.py
['意想不到', '意兴阑珊', '意味深长', '意气风发', '意气高昂', '意气用事']
['意气风发', '意气高昂', '意气用事']
Process finished with exit code 0
更复杂的搜索提示功能:支持拼音,首字母拼音,中文等。类似优酷,酷狗等搜索提示功能。完美实现。

安装包:
pip install pytrie
pip install pandas
pip install pypinyin

搜索提示功能实现:
# -*- encoding=utf-8 -*-
"""Search suggestions by Chinese prefix, full pinyin, or pinyin initials."""

# Imports
import time

import pandas as pd
import pypinyin
from pytrie import StringTrie


def pinyin(text):
    """Convert text to pinyin, with the pieces separated by single spaces.

    :param text: text to convert
    :return: space-joined result of ``pypinyin.lazy_pinyin``
    """
    return ' '.join(pypinyin.lazy_pinyin(text))


def get_every_word_first(text):
    """Return the first character of every space-separated pinyin token.

    :param text: text to convert
    :return: the concatenated first characters ('' when there is no token)
    """
    # Empty tokens (empty text, consecutive spaces) are skipped; indexing
    # them with [0] would raise IndexError.
    return ''.join(token[0] for token in pinyin(text).split(' ') if token)


def get_all_pinying(text):
    """Return the full pinyin of ``text`` joined without any separator.

    :param text: text to convert
    :return: concatenated result of ``pypinyin.lazy_pinyin``
    """
    return ''.join(pypinyin.lazy_pinyin(text))


class Suggester(object):
    """Prefix trie holding lower-cased words plus their two pinyin forms."""

    def __init__(self):
        self.trie = StringTrie()

    def update_trie(self, word_list):
        """Index each word under itself, its initials and its full pinyin.

        :param word_list: iterable of words (strings)
        """
        for word in word_list:
            word = word.lower()
            initials = get_every_word_first(word)
            full_pinyin = get_all_pinying(word)
            # Every key maps to itself; get_tips_word translates the matched
            # pinyin strings back to words through the mapping table.
            self.trie[word] = word
            self.trie[initials] = initials
            self.trie[full_pinyin] = full_pinyin

    def search_prefix(self, prefix):
        """Return the values of all trie entries whose key starts with ``prefix``.

        :param prefix: prefix to look up
        :return: matching values (words and/or pinyin strings)
        """
        return self.trie.values(prefix=prefix)


def build_all_trie(wordlist):
    """Build the trie and the word/pinyin mapping table.

    :param wordlist: list of keywords
    :return: tuple ``(sug, data)``; ``data`` has the columns ``word``,
        ``pinyin1`` (initials) and ``pinyin2`` (full pinyin)
    """
    sug = Suggester()
    sug.update_trie(wordlist)
    # The pinyin columns are derived from the lower-cased word, exactly like
    # the trie keys, so that words containing upper-case letters can be
    # mapped back from a trie hit.
    lowered = [word.lower() for word in wordlist]
    data = pd.DataFrame({
        "word": wordlist,
        "pinyin1": [get_every_word_first(word) for word in lowered],
        "pinyin2": [get_all_pinying(word) for word in lowered],
    })
    return sug, data


def check_contain_chinese(check_str):
    """Tell whether every character lies in the range U+4E00..U+9FFF.

    :param check_str: string to inspect
    :return: True when all characters are in the range (also for '')
    """
    # Both bounds are inclusive so that U+4E00 itself counts as Chinese.
    return all(u'\u4e00' <= ch <= u'\u9fff' for ch in check_str)


def get_tips_word(sug, data, s):
    """Return the suggestion words for the search term ``s``.

    :param sug: Suggester built by build_all_trie
    :param data: mapping table built by build_all_trie
    :param s: search term (Chinese, pinyin, initials, or a mix)
    :return: sorted list of matching words; None for an empty search term
        or when the lookup fails
    """
    try:
        if not s:
            return None
        # Trie keys are lower-cased, so the query has to be as well.
        query = s.lower()
        words = data['word'].str.lower()
        if check_contain_chinese(query):
            # Chinese-only input: match on the words themselves.
            hits = set(sug.search_prefix(query))
            mask = words.isin(hits)
        else:
            # Anything else is converted to pinyin first; a hit may be a
            # word, a string of initials, or a full pinyin string.
            hits = set(sug.search_prefix(get_all_pinying(query)))
            mask = (words.isin(hits)
                    | data['pinyin1'].isin(hits)
                    | data['pinyin2'].isin(hits))
        # Sorting makes the order of the de-duplicated result deterministic.
        return sorted(set(data.loc[mask, 'word']))
    except Exception as e:
        # Best effort on purpose: a failed lookup is reported and yields no
        # suggestions instead of propagating to the caller.
        print("{0}".format(str(e)))
        return None


if __name__ == '__main__':
    wordlist = ['意味深长', '意想不到', '意气用事', '意气风发', '意兴阑珊', '意气高昂', '意气相投', '巴黎恋人', '巴黎圣母院', '巴黎宝贝']
    # Build the trie and the mapping table
    sug, data = build_all_trie(wordlist)
    time1 = time.perf_counter()
    # Search term
    s = 'b'
    result = get_tips_word(sug, data, s)
    print(result)
    time2 = time.perf_counter()
    print('总共耗时:' + str(time2 - time1) + 's')
运行结果:
E:\laidefa\python.exe E:/短信报警/SearchSuggestion-master/backend/实现搜索提示功能.py
['巴黎圣母院', '巴黎宝贝', '巴黎恋人']
总共耗时:0.002985239028930664s
Process finished with exit code 0
