代码

  # coding: utf-8
  """Rank a small set of Chinese sentences by TF-IDF similarity to a query.

  Every sentence is segmented with jieba, converted to a bag-of-words vector
  over a shared gensim dictionary, re-weighted by a TF-IDF model fitted on the
  reference sentences, and compared with the query through a
  SparseMatrixSimilarity index (cosine similarity, per the gensim docs).
  """
  import jieba
  from gensim import corpora, models, similarities

  # Reference documents. Their position in ``all_doc`` is the document id
  # that appears in the final ranking.
  doc0 = "我不喜欢上海"
  doc1 = "上海是一个好地方"
  doc2 = "北京是一个好地方"
  doc3 = "上海好吃的在哪里"
  doc4 = "上海好玩的在哪里"
  doc5 = "上海是好地方"
  doc6 = "上海路和上海人"
  doc7 = "喜欢小吃"

  # Query sentence that is compared against every reference document.
  doc_test = "我喜欢上海的小吃"

  all_doc = [doc0, doc1, doc2, doc3, doc4, doc5, doc6, doc7]

  # Segment each reference document into a list of tokens; print the token
  # lists once so the segmentation can be inspected.
  all_doc_list = [list(jieba.cut(doc)) for doc in all_doc]
  print(all_doc_list)

  # The query is segmented the same way as the reference documents.
  doc_test_list = list(jieba.cut(doc_test))

  # Build the token -> id mapping from the reference documents only, then
  # express every document (and the query) as a bag-of-words vector.
  dictionary = corpora.Dictionary(all_doc_list)
  corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]
  doc_test_vec = dictionary.doc2bow(doc_test_list)

  # Fit TF-IDF weights on the reference corpus and index the weighted vectors.
  # num_features must equal the vocabulary size, i.e. the dictionary length.
  tfidf = models.TfidfModel(corpus)
  index = similarities.SparseMatrixSimilarity(
      tfidf[corpus], num_features=len(dictionary)
  )

  # One similarity score per reference document, in ``all_doc`` order.
  sim = index[tfidf[doc_test_vec]]

  # Print (document id, score) pairs from most to least similar.
  print(sorted(enumerate(sim), key=lambda item: item[1], reverse=True))

结论

  • doc_test（“我喜欢上海的小吃”）与下标为 7 的文档 doc7（“喜欢小吃”）最相似。

参考资料