参考资料:
    【python 走进NLP】搜索提示功能前缀字典树
    巧用 Trie 树实现搜索引擎关键词提示功能
    python利用Trie(前缀树)实现搜索引擎中关键字输入提示(学习Hash Trie和Double-array Trie)
    【python 走进NLP】搜索提示功能前缀字典树 - 图1
    站内搜索是一个网站的基本功能,一个好的搜索提示也能很好地提升用户体验,提高用户找到自己需要的东西的效率。下面基于字典树实现一个简单的中文前缀搜索提示功能。
    用户输入的时候自动提示。
    image.png

    1. # -*- encoding=utf-8 -*-
    2. # 导入包
    3. from pytrie import StringTrie
    # Prefix-suggestion helper built on a pytrie StringTrie.
    class Suggester(object):
        """Store words in a prefix trie and look them up by prefix.

        Words are lower-cased before insertion, and lookups lower-case the
        prefix as well, so matching is case-insensitive.
        """

        def __init__(self):
            # Maps each lower-cased word to itself.
            self.trie = StringTrie()

        def update_trie(self, word_list):
            """Insert every word of ``word_list`` into the trie.

            :param word_list: iterable of words (strings)
            """
            for word in word_list:
                word = word.lower()
                self.trie[word] = word

        def search_prefix(self, prefix):
            """Return the stored words that start with ``prefix``.

            :param prefix: prefix typed by the user
            :return: whatever ``self.trie.values(prefix=...)`` yields for the
                lower-cased prefix
            """
            # Keys were lower-cased on insertion; lower-case the query too,
            # otherwise a prefix such as "ABC" could never match anything.
            return self.trie.values(prefix=prefix.lower())
    # Build a prefix trie from a word-list file.
    def build_prefix_tree(wordlist, encoding=None):
        """Build a ``Suggester`` from a text file holding one word per line.

        :param wordlist: path of the word-list file
        :param encoding: text encoding used to read the file; ``None`` (the
            default) keeps the platform default, which is what the function
            did before this parameter existed. Pass e.g. ``"utf-8"`` when the
            file's encoding differs from the platform default.
        :return: a ``Suggester`` populated with every line of the file
        """
        # The context manager closes the file handle as soon as it is read,
        # instead of leaving it open until garbage collection.
        with open(wordlist, encoding=encoding) as word_file:
            word_list = word_file.read().splitlines()
        suggester = Suggester()
        suggester.update_trie(word_list)
        return suggester
    if __name__ == '__main__':
        # Demo: index six idioms, then print the completions for two prefixes.
        demo = Suggester()
        demo.update_trie(['意味深长','意想不到','意气用事','意气风发','意兴阑珊','意气高昂'])
        for query in ('意', '意气'):
            print(demo.search_prefix(query))

    运行结果:

    1. E:\laidefa\python.exe E:/短信报警/SearchSuggestion-master/backend/实现搜索提示功能.py
    2. ['意想不到', '意兴阑珊', '意味深长', '意气风发', '意气高昂', '意气用事']
    3. ['意气风发', '意气高昂', '意气用事']
    4. Process finished with exit code 0

    更复杂的搜索提示功能:支持拼音,首字母拼音,中文等。类似优酷,酷狗等搜索提示功能。完美实现。
    image.png
    image.png
    安装包:

    1. pip install pytrie
    2. pip install pandas
    3. pip install pypinyin

    image.png
    搜索提示功能实现:

    1. # -*- encoding=utf-8 -*-
    2. # 导入包
    3. from pytrie import StringTrie
    4. import pypinyin
    5. import pandas as pd
    6. import time
    # Convert text to space-separated pinyin.
    def pinyin(text):
        """Convert ``text`` to pinyin.

        :param text: text to convert
        :return: the items returned by ``pypinyin.lazy_pinyin(text)``, joined
            with single spaces
        """
        syllables = pypinyin.lazy_pinyin(text)
        return ' '.join(syllables)
    # Build the pinyin-initials abbreviation of a text.
    def get_every_word_first(text):
        """Return the first character of each pinyin item of ``text``, joined.

        :param text: text to abbreviate
        :return: the concatenated first characters of the space-separated
            pieces of ``pinyin(text)``; ``''`` when there are no pieces
        """
        # Splitting on ' ' produces empty pieces for empty text and for
        # leading, trailing or consecutive spaces. Indexing ``[0]`` into an
        # empty piece raises IndexError, so those pieces are skipped.
        return ''.join(piece[0] for piece in pinyin(text).split(' ') if piece)
    # Convert text to its full pinyin, written without separators.
    def get_all_pinying(text):
        """Convert ``text`` to one unbroken pinyin string.

        :param text: text to convert
        :return: the items returned by ``pypinyin.lazy_pinyin(text)``,
            concatenated with no separator
        """
        syllables = pypinyin.lazy_pinyin(text)
        return ''.join(syllables)
    # Prefix-suggestion helper indexing words, pinyin initials and full pinyin.
    class Suggester(object):
        """Prefix trie over words and their two pinyin spellings.

        For every word three keys are stored, each mapped to itself: the
        lower-cased word, its pinyin initials and its full pinyin. Lookups
        lower-case the prefix, so matching is case-insensitive.
        """

        def __init__(self):
            # Each key maps to itself; callers translate hits back to words.
            self.trie = StringTrie()

        def update_trie(self, word_list):
            """Index every word of ``word_list`` under its three keys.

            :param word_list: iterable of words (strings)
            """
            for word in word_list:
                word = word.lower()
                # Derive both pinyin spellings from the lower-cased word.
                initials = get_every_word_first(word)
                full_pinyin = get_all_pinying(word)
                self.trie[word] = word
                self.trie[initials] = initials
                self.trie[full_pinyin] = full_pinyin

        def search_prefix(self, prefix):
            """Return the stored keys that start with ``prefix``.

            :param prefix: prefix typed by the user (word or pinyin)
            :return: whatever ``self.trie.values(prefix=...)`` yields for the
                lower-cased prefix
            """
            # Every key is derived from a lower-cased word; lower-case the
            # query too, otherwise a prefix such as "B" could never match.
            return self.trie.values(prefix=prefix.lower())
    # Build the trie and the word/pinyin lookup table.
    def build_all_trie(wordlist):
        """Build the suggestion trie plus the table used to map hits to words.

        :param wordlist: list of keywords
        :return: ``(sug, data)`` where ``sug`` is the populated ``Suggester``
            and ``data`` is a DataFrame with columns ``word`` (the keyword as
            given), ``pinyin1`` (pinyin initials) and ``pinyin2`` (full
            pinyin)
        """
        sug = Suggester()
        sug.update_trie(wordlist)
        data = pd.DataFrame({"word": wordlist})
        # The trie derives its pinyin keys from lower-cased words. Compute the
        # columns from the lower-cased word too, so a keyword containing
        # upper-case letters still lines up with its trie entries.
        lowered = data['word'].map(str.lower)
        data['pinyin1'] = lowered.apply(get_every_word_first)
        data['pinyin2'] = lowered.apply(get_all_pinying)
        return sug, data
    # Check whether a string consists only of Chinese characters.
    def check_contain_chinese(check_str):
        """Return True when every character lies in U+4E00..U+9FFF inclusive.

        :param check_str: string to inspect
        :return: ``True`` if all characters are inside the range (an empty
            string also yields ``True``), otherwise ``False``
        """
        # Both bounds are inclusive: U+4E00 is the very common character
        # "one", which an exclusive lower bound would reject as non-Chinese.
        # ``all`` stops at the first character outside the range.
        return all(u'\u4e00' <= ch <= u'\u9fff' for ch in check_str)
    # Look up search suggestions for a query.
    def get_tips_word(sug,data,s):
        """Return the keywords suggested for the query ``s``.

        A query made only of Chinese characters is matched against the words
        themselves. Any other query is converted to pinyin and matched against
        the pinyin initials, the full pinyin and the words. Matching is
        case-insensitive.

        :param sug: the ``Suggester`` holding the trie
        :param data: DataFrame with columns ``word``, ``pinyin1``, ``pinyin2``
        :param s: the search query
        :return: list of matching keywords without duplicates, in the order
            they appear in ``data``; ``None`` for an empty query or when an
            error occurred (the error message is printed)
        """
        try:
            if not s:
                return None
            # The trie only holds lower-cased keys, so compare in lower case.
            query = s.lower()
            words = data['word']
            lowered_words = words.map(str.lower)
            if check_contain_chinese(query):
                # Chinese-only query: prefix-match the words directly.
                hits = list(sug.search_prefix(query))
                mask = lowered_words.isin(hits)
            else:
                # Otherwise convert the query to pinyin and match every column.
                hits = list(sug.search_prefix(get_all_pinying(query)))
                # One combined mask replaces the chained DataFrame.append
                # calls; that method no longer exists in pandas 2.0+.
                mask = (
                    data['pinyin1'].map(str.lower).isin(hits)
                    | data['pinyin2'].map(str.lower).isin(hits)
                    | lowered_words.isin(hits)
                )
            # dict.fromkeys removes duplicates while keeping table order,
            # so the result is deterministic (a set would not be).
            return list(dict.fromkeys(words[mask]))
        except Exception as e:
            # Best-effort contract kept from the original: report and
            # return None rather than propagate.
            print("{0}".format(str(e)))
            return None
    if __name__ == '__main__':
        # Demo: build the index, then time a single one-letter query.
        keywords = ['意味深长','意想不到','意气用事','意气风发','意兴阑珊','意气高昂','意气相投','巴黎恋人','巴黎圣母院','巴黎宝贝']
        suggester, table = build_all_trie(keywords)
        started = time.time()
        query = 'b'
        print(get_tips_word(suggester, table, query))
        finished = time.time()
        elapsed = finished - started
        print('总共耗时:' + str(elapsed) + 's')

    运行结果:

    1. E:\laidefa\python.exe E:/短信报警/SearchSuggestion-master/backend/实现搜索提示功能.py
    2. ['巴黎圣母院', '巴黎宝贝', '巴黎恋人']
    3. 总共耗时:0.002985239028930664s
    4. Process finished with exit code 0