Core Concepts
Document
A piece of text; a string in Python.
```python
document = "Human machine interface for lab abc computer applications"
```
Corpus
A collection of Documents, used as input to a model.
```python
text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]
```
To assign each word in the corpus an ID, use gensim.corpora.Dictionary:
```python
import pprint
from collections import defaultdict
from gensim import corpora

# Create a set of frequent words
stoplist = set('for a of the and to in'.split(' '))
# Lowercase each document, split it by white space and filter out stopwords
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in text_corpus
]

# Count word frequencies
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
pprint.pprint(processed_corpus)
"""
[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]
"""

dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)
"""
Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)
"""
```
Vector
The vector representation of a document.
```python
pprint.pprint(dictionary.token2id)  # get token2id
"""
{'computer': 0,
 'eps': 8,
 'graph': 10,
 'human': 1,
 'interface': 2,
 'minors': 11,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'trees': 9,
 'user': 7}
"""

new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())  # convert the document to a vector
print(new_vec)
"""
[(0, 1), (1, 1)]  # each entry is (token_id, token_count); tokens not in the dictionary are ignored
"""

bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
pprint.pprint(bow_corpus)
"""
[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]
"""
```
Model
An algorithm that transforms vectors from the raw document vector space into a target space.
```python
from gensim import models, similarities

# train the model
tfidf = models.TfidfModel(bow_corpus)

# transform the "system minors" string
words = "system minors".lower().split()
print(tfidf[dictionary.doc2bow(words)])
"""
[(5, 0.5898341626740045), (11, 0.8075244024440723)]
"""

# index the whole corpus in TF-IDF space for similarity queries
index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=12)

query_document = 'system engineering'.split()
query_bow = dictionary.doc2bow(query_document)
sims = index[tfidf[query_bow]]
# similarity between the query and each document in bow_corpus
for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
    print(document_number, score)
"""
3 0.7184812
2 0.41707572
1 0.32448703
0 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0
"""
```
Summary
documents in a corpus -> vector representations -> a model transforms the raw vectors into a target space
text -> vector
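As a recap, here is a minimal end-to-end sketch of this pipeline; the toy corpus and query below are illustrative, not from the tutorial:

```python
from gensim import corpora, models, similarities

# toy corpus: one tokenized document per list
texts = [["human", "interface", "computer"],
         ["survey", "user", "computer", "system", "response", "time"]]

dictionary = corpora.Dictionary(texts)              # assign each token an id
bow = [dictionary.doc2bow(t) for t in texts]        # documents -> sparse count vectors
tfidf = models.TfidfModel(bow)                      # model: bow space -> tf-idf space
index = similarities.SparseMatrixSimilarity(tfidf[bow], num_features=len(dictionary))

query = dictionary.doc2bow("human computer".split())
print(index[tfidf[query]])                          # similarity of the query to each document
```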
Streaming Loading
```python
from smart_open import open  # for transparently opening remote files


class MyCorpus(object):  # an iterator
    def __iter__(self):
        for line in open('https://radimrehurek.com/gensim/mycorpus.txt'):
            # assume there's one document per line, tokens separated by whitespace
            yield dictionary.doc2bow(line.lower().split())


corpus_memory_friendly = MyCorpus()  # doesn't load the corpus into memory!
for vector in corpus_memory_friendly:  # load one vector into memory at a time
    print(vector)
"""
[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (5, 1), (7, 1), (8, 1)]
[(1, 1), (5, 2), (8, 1)]
[(3, 1), (6, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]
"""
```
```python
from smart_open import open  # for transparently opening remote files
from gensim import corpora

# collect statistics about all tokens
dictionary = corpora.Dictionary(
    line.lower().split()
    for line in open('https://radimrehurek.com/gensim/mycorpus.txt')
)

# remove stop words (`stoplist` as defined earlier) and words that appear only once
stop_ids = [
    dictionary.token2id[stopword]
    for stopword in stoplist
    if stopword in dictionary.token2id
]
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)
dictionary.compactify()  # remove gaps in the id sequence after words were removed
print(dictionary)
"""
Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)
"""
```
Similarity Queries
```python
from collections import defaultdict
from gensim import corpora, models, similarities

documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

# remove words that appear only once
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] > 1] for text in texts]

# build the corpus vectors
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# LSI model; each entry of an LSI vector is (topic_id, topic weight)
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
doc = "Human computer interaction"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]  # convert the query to LSI space
print(vec_lsi)
"""
[(0, 0.4618210045327158), (1, 0.07002766527900064)]
"""

# initialize the query index: transform the corpus to LSI space and index it
index = similarities.MatrixSimilarity(lsi[corpus])

# save and load the index
index.save('/tmp/deerwester.index')
index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')

# query similarity
sims = index[vec_lsi]  # perform a similarity query against the corpus
print(list(enumerate(sims)))  # print (document_number, document_similarity) 2-tuples
"""
[(0, 0.998093), (1, 0.93748635), (2, 0.9984453), (3, 0.9865886), (4, 0.90755945),
 (5, -0.12416792), (6, -0.10639259), (7, -0.09879464), (8, 0.050041765)]
"""

sims = sorted(enumerate(sims), key=lambda item: -item[1])
for doc_position, doc_score in sims:
    print(doc_score, documents[doc_position])
"""
0.9984453 The EPS user interface management system
0.998093 Human machine interface for lab abc computer applications
0.9865886 System and human system engineering testing of EPS
0.93748635 A survey of user opinion of computer system response time
0.90755945 Relation of user perceived response time to error measurement
0.050041765 Graph minors A survey
-0.09879464 Graph minors IV Widths of trees and well quasi ordering
-0.10639259 The intersection graph of paths in trees
-0.12416792 The generation of random binary unordered trees
"""
```
Word2Vec
Loading a Pretrained Model
```python
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')  # load a pretrained model

for i, word in enumerate(wv.vocab):  # vocab: the model's words
    if i == 10:
        break
    print(word)
"""
</s>
in
for
that
is
on
##
The
with
said
"""

try:
    vec_cameroon = wv['cameroon']
except KeyError:
    print("The word 'cameroon' does not appear in this model")
"""
The word 'cameroon' does not appear in this model
"""

pairs = [
    ('car', 'minivan'),    # a minivan is a kind of car
    ('car', 'bicycle'),    # still a wheeled vehicle
    ('car', 'airplane'),   # ok, no wheels, but still a vehicle
    ('car', 'cereal'),     # ... and so on
    ('car', 'communism'),
]
for w1, w2 in pairs:
    # similarity: cosine similarity between two words
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))
"""
'car'   'minivan'   0.69
'car'   'bicycle'   0.54
'car'   'airplane'  0.42
'car'   'cereal'    0.14
'car'   'communism' 0.06
"""

print(wv.most_similar(positive=['car', 'minivan'], topn=5))
"""
[('SUV', 0.853219211101532), ('vehicle', 0.8175784349441528), ('pickup_truck', 0.7763689160346985), ('Jeep', 0.7567334175109863), ('Ford_Explorer', 0.756571888923645)]
"""

print(wv.doesnt_match(['fire', 'water', 'land', 'sea', 'air', 'car']))
"""
car
"""
```
Training Your Own Model
```python
from gensim import utils
from gensim.test.utils import datapath
import gensim.models


# load the data; this can be an iterator or a generator
class MyCorpus(object):
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        corpus_path = datapath('lee_background.cor')
        for line in open(corpus_path):
            # assume there's one document per line, tokens separated by whitespace
            yield utils.simple_preprocess(line)


sentences = MyCorpus()
model = gensim.models.Word2Vec(sentences=sentences)

# The trained word vectors are stored in a KeyedVectors instance in model.wv:
vector = model.wv['computer']  # numpy vector of a word

# To trim unneeded model state (= use much less RAM and allow fast loading
# and memory sharing via mmap), keep only the KeyedVectors:
word_vectors = model.wv
del model
```
Word2Vec Parameters
- sentences (iterable of iterables, optional) – The sentences iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network.
- min_count (int, optional) – Ignores all words with total frequency lower than this.
- size (int, optional) – Dimensionality of the word vectors.
- window (int, optional) – Maximum distance between the current and predicted word within a sentence.
- workers (int, optional) – Use this many worker threads to train the model (= faster training with multicore machines). Install Cython for multithreading to take effect!
- sg ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW.
- cbow_mean ({0, 1}, optional) – If 0, use the sum of the context word vectors; if 1, use the mean. Only applies when CBOW is used.
- hs ({0, 1}, optional) – If 1, hierarchical softmax will be used for model training. If 0, and negative is non-zero, negative sampling will be used.
- negative (int, optional) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
- sample (float, optional) – The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5).
- compute_loss (bool, optional) – If True, computes and stores the loss value, which can be retrieved with get_latest_training_loss().
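As a quick reference, here is a minimal construction sketch tying several of these parameters together. The toy sentences and parameter values are illustrative, not recommendations, and this assumes the gensim 3.x API (where the dimensionality argument is `size`; in gensim 4.x it is `vector_size`):

```python
import gensim.models

# toy data; in practice stream sentences from disk as shown above
sentences = [
    ["human", "machine", "interface"],
    ["machine", "learning", "interface"],
]

model = gensim.models.Word2Vec(
    sentences=sentences,
    size=100,           # dimensionality of the word vectors
    window=5,           # max distance between current and predicted word
    min_count=1,        # ignore words with total frequency below this
    workers=4,          # worker threads (needs Cython for true parallelism)
    sg=1,               # 1 = skip-gram, 0 = CBOW
    negative=5,         # negative sampling with 5 "noise words"
    compute_loss=True,  # store the training loss
)
print(model.get_latest_training_loss())
```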
Loading and Saving
- If you need to continue training later, save and load the full model: `model.save(fname)` / `Word2Vec.load(fname)`.
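A minimal sketch of this save/load/resume round trip, assuming `sentences` and `more_sentences` are placeholder iterables of token lists:

```python
import gensim.models

model = gensim.models.Word2Vec(sentences=sentences)
model.save('word2vec.model')  # keeps the full model state, so training can resume

model = gensim.models.Word2Vec.load('word2vec.model')
# if more_sentences contains unseen words, update the vocabulary first:
model.build_vocab(more_sentences, update=True)
model.train(more_sentences, total_examples=model.corpus_count, epochs=model.epochs)
```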
- If you don't need to continue training, save only the word vectors:
```python
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

path = get_tmpfile("wordvectors.kv")
model.wv.save(path)
wv = KeyedVectors.load(path, mmap='r')  # load back, memory-mapped read-only
vector = wv['computer']  # numpy vector of a word
```
Gensim can also load word vectors in the "word2vec C format", as a KeyedVectors instance:
```python
from gensim.test.utils import datapath
from gensim.models import KeyedVectors

# C text format
wv_from_text = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'), binary=False)
# C binary format
wv_from_bin = KeyedVectors.load_word2vec_format(datapath("euclidean_vectors.bin"), binary=True)
```
- Only used to initialize word embeddings and word2index:
```python
import json

import gensim
import numpy as np
import torch
from torch import nn

# train word2vec
# self.corpus: a list of token lists, i.e. [[word]]; `self`, `embedding_size` and
# `padding_idx` come from the surrounding class
model = gensim.models.word2vec.Word2Vec(self.corpus, size=300, min_count=1)
# model.save('word2vec_redial')

# word2index: reserve ids 0-3 for special tokens
word2index = {word: i + 4 for i, word in enumerate(model.wv.index2word)}
word2index['_split_'] = len(word2index) + 4
json.dump(word2index, open('word2index_redial.json', 'w', encoding='utf-8'),
          ensure_ascii=False)

# save the word embeddings in numpy format:
# 4 zero rows for the special tokens, one trailing zero row for '_split_'
word2embedding = ([[0] * 300] * 4
                  + [model.wv[word] for word in model.wv.index2word]
                  + [[0] * 300])
print(np.shape(word2embedding))
np.save('word2vec_redial.npy', word2embedding)

# initialize the word embedding in the model
e = nn.Embedding(len(word2index) + 4, embedding_size, padding_idx)
e.weight.data.copy_(torch.from_numpy(np.load('word2vec_redial.npy')))
```
