gensim [word2vec & doc2vec]

官方文档:https://radimrehurek.com/gensim/models/word2vec.html
https://rare-technologies.com/word2vec-tutorial/
gensim介绍:python NLP的包
gensim包依赖于numpy包和scipy包(使用pip安装gensim时会自动安装这两个依赖;若手动安装,则需要先安装numpy和scipy,再安装gensim)

word2vec

  1. # 参考https://radimrehurek.com/gensim/models/word2vec.html
  2. from gensim.test.utils import common_texts, get_tmpfile # common_texts表示gensim包自带的可训练数据
  3. from gensim.models import Word2Vec
  4. from gensim.models import KeyedVectors
  5. from gensim.test.utils import datapath
  6. path = get_tmpfile("word2vec.model")
  7. model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4) # word2vec中最重要的一条语句, size表示维数
  8. model.save("word2vec.model") # 【保存】模型
  9. model = Word2Vec.load("word2vec.model") # 【加载(方式1)】模型(需要继续训练时)
  10. model.train([["hello", "world"]], total_examples=1, epochs=1) # 添加新的文本进行训练
  11. vector = model.wv['computer'] # 得到单词的词向量
  12. print(vector)
  13. # wv = KeyedVectors.load("word2vec.model", mmap='r') # 【加载(方式2)】模型(不需要继续训练时,即没有新的本文加入。使用这种方式更小更快)
  14. # vector_1 = wv['computer'] # numpy vector of a word
  15. # print(vector_1)
  16. # wv_from_text = KeyedVectors.load_word2vec_format(datapath('word2vec.model'), binary=False) # C text format
  17. # wv_from_bin = KeyedVectors.load_word2vec_format(datapath('word2vec.model'), binary=True) # C bin format

doc2vec

  1. # 参考文档:https://radimrehurek.com/gensim/models/doc2vec.html
  2. from gensim.test.utils import common_texts
  3. from gensim.models.doc2vec import Doc2Vec, TaggedDocument
  4. from gensim.test.utils import get_tmpfile
  5. # fname = get_tmpfile("doc2vec.model")
  6. documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
  7. model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)
  8. model.save("doc2vec.model")
  9. model = Doc2Vec.load("doc2vec.model")
  10. vector = model.docvecs[0] # 第一行,即索引为0的行训练得到的向量
  11. # vector = model.infer_vector(["system", "response"]) # 预测新向量
  12. print(vector)