gensim[word2vec & doc2vec]
官方文档:https://radimrehurek.com/gensim/models/word2vec.html
https://rare-technologies.com/word2vec-tutorial/
gensim介绍:用于自然语言处理(NLP)的 Python 包
gensim包依赖于numpy包和scipy包,即需要先安装numpy和scipy,再安装gensim
word2vec
# Word2Vec walkthrough: train on gensim's bundled sample corpus, save the
# model, reload it, continue training, and look up one word vector.
# Reference: https://radimrehurek.com/gensim/models/word2vec.html
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.test.utils import common_texts, get_tmpfile  # common_texts: sample training data shipped with gensim
from gensim.test.utils import datapath

# Temp-file location for the model. NOTE: not used below -- the model is
# saved to "word2vec.model" relative to the current working directory.
path = get_tmpfile("word2vec.model")

# The most important statement of the example: train the model.
# `vector_size` is the dimensionality of the word vectors. gensim >= 4.0
# uses this name; gensim 3.x spelled the same parameter `size`.
model = Word2Vec(common_texts, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")  # [save] the model

# [load, option 1] load the full model (use this when training will continue).
model = Word2Vec.load("word2vec.model")
model.train([["hello", "world"]], total_examples=1, epochs=1)  # train on additional text

vector = model.wv['computer']  # word vector of a single word
print(vector)

# [load, option 2] load only the vectors (when no further training is needed,
# i.e. no new text will be added; this form is smaller and faster).
# NOTE(review): KeyedVectors.load presumably expects a file written by
# `model.wv.save(...)`, not by `model.save(...)` -- confirm before enabling.
# wv = KeyedVectors.load("word2vec.model", mmap='r')
# vector_1 = wv['computer']  # numpy vector of a word
# print(vector_1)

# NOTE(review): `datapath` looks like it resolves names inside gensim's own
# test-data directory, and load_word2vec_format reads the original C tool
# format -- verify the file name/format before enabling these lines.
# wv_from_text = KeyedVectors.load_word2vec_format(datapath('word2vec.model'), binary=False)  # C text format
# wv_from_bin = KeyedVectors.load_word2vec_format(datapath('word2vec.model'), binary=True)  # C bin format
doc2vec
# Doc2Vec walkthrough: tag gensim's bundled sample corpus, train a model,
# round-trip it through disk, and print the learned vector of one document.
# Reference: https://radimrehurek.com/gensim/models/doc2vec.html
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts
from gensim.test.utils import get_tmpfile

# fname = get_tmpfile("doc2vec.model")

# Each sample document is tagged with its own index in the corpus.
documents = [
    TaggedDocument(doc, [i])
    for i, doc in enumerate(common_texts)
]

model = Doc2Vec(
    documents,
    vector_size=5,
    window=2,
    min_count=1,
    workers=4,
)

# Save to the working directory, then load the model back from that file.
model.save("doc2vec.model")
model = Doc2Vec.load("doc2vec.model")

# Trained vector of the first document, i.e. the one tagged with index 0.
vector = model.docvecs[0]
# vector = model.infer_vector(["system", "response"])  # infer a vector for a new document
print(vector)
