gensim[word2vec & doc2vec]
官方文档:https://radimrehurek.com/gensim/models/word2vec.html
https://rare-technologies.com/word2vec-tutorial/
gensim介绍:python NLP的包
gensim包依赖于numpy包和scipy包,即需要先安装numpy和scipy,再安装gensim
word2vec
# 参考https://radimrehurek.com/gensim/models/word2vec.html
from gensim.test.utils import common_texts, get_tmpfile # common_texts表示gensim包自带的可训练数据
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
# Temp-file path built by gensim's test helper. Note that the save/load calls
# below use the literal "word2vec.model" (relative to the working directory),
# not this path.
path = get_tmpfile("word2vec.model")

# The key statement of the example: train Word2Vec on the toy corpus bundled
# with gensim. `vector_size` is the dimensionality of the word vectors;
# gensim 4.0 renamed the older `size` keyword to `vector_size`.
model = Word2Vec(common_texts, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")  # [save] the model
model = Word2Vec.load("word2vec.model")  # [load, option 1] the full model (use when training will continue)
model.train([["hello", "world"]], total_examples=1, epochs=1)  # train on additional text
vector = model.wv['computer']  # word vector for a single word
print(vector)
# [load, option 2] for when no further training is needed (no new text will be
# added); the original note says this way is smaller and faster.
# wv = KeyedVectors.load("word2vec.model", mmap='r')
# vector_1 = wv['computer'] # numpy vector of a word
# print(vector_1)
# wv_from_text = KeyedVectors.load_word2vec_format(datapath('word2vec.model'), binary=False) # C text format
# wv_from_bin = KeyedVectors.load_word2vec_format(datapath('word2vec.model'), binary=True) # C bin format
doc2vec
# 参考文档:https://radimrehurek.com/gensim/models/doc2vec.html
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
# fname = get_tmpfile("doc2vec.model")
# Wrap every sample sentence in a TaggedDocument whose single tag is the
# sentence's index in the corpus.
documents = [
    TaggedDocument(tokens, [tag])
    for tag, tokens in enumerate(common_texts)
]
model = Doc2Vec(
    documents,
    vector_size=5,
    window=2,
    min_count=1,
    workers=4,
)
# Round-trip the trained model through "doc2vec.model" in the working directory.
model.save("doc2vec.model")
model = Doc2Vec.load("doc2vec.model")
# Trained vector for the first document (tag 0).
# NOTE(review): gensim 4.x renamed `docvecs` to `dv` -- confirm against the
# installed version.
vector = model.docvecs[0]
# vector = model.infer_vector(["system", "response"])  # infer a vector for new text
print(vector)