Core Concepts

Document

A document is a piece of text, represented in Python as a string.

```python
document = "Human machine interface for lab abc computer applications"
```

Corpus

A corpus is a collection of documents, used as input to a model.

```python
text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]
```

To assign an integer ID to each word in the corpus, use gensim.corpora.Dictionary:

```python
import pprint
from collections import defaultdict

# Create a set of frequent words
stoplist = set('for a of the and to in'.split(' '))
# Lowercase each document, split it by white space and filter out stopwords
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in text_corpus]
# Count word frequencies
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
pprint.pprint(processed_corpus)
'''
[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]
'''
from gensim import corpora
dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)
'''
Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)
'''
```

Vector

A vector is the numerical representation of a document.

```python
pprint.pprint(dictionary.token2id)  # get the token -> id mapping
"""
{'computer': 0,
 'eps': 8,
 'graph': 10,
 'human': 1,
 'interface': 2,
 'minors': 11,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'trees': 9,
 'user': 7}
"""
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())  # convert the document to a vector
print(new_vec)
"""
[(0, 1), (1, 1)]  # first element is the token id, second is its count; tokens not in the dictionary are ignored
"""
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
pprint.pprint(bow_corpus)
"""
[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]
"""
```

Model

A model is an algorithm that transforms vectors, mapping the raw document vector space into a target space.

```python
from gensim import models

# train the model
tfidf = models.TfidfModel(bow_corpus)
# transform the "system minors" string
words = "system minors".lower().split()
print(tfidf[dictionary.doc2bow(words)])
"""
[(5, 0.5898341626740045), (11, 0.8075244024440723)]
"""
from gensim import similarities

index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=12)
query_document = 'system engineering'.split()
query_bow = dictionary.doc2bow(query_document)
sims = index[tfidf[query_bow]]
for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
    print(document_number, score)  # similarity between the query and each document in bow_corpus
"""
3 0.7184812
2 0.41707572
1 0.32448703
0 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0
"""
```

Summary

corpus of documents -> vector representation -> a model transforms the raw vectors into a target space
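
Putting these pieces together, here is a minimal end-to-end sketch (reusing the `processed_corpus` built in the Corpus section above):

```python
from gensim import corpora, models, similarities

# corpus of documents -> bag-of-words vectors
dictionary = corpora.Dictionary(processed_corpus)
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

# a model transforms the raw bag-of-words vectors into a target space (TF-IDF here)
tfidf = models.TfidfModel(bow_corpus)

# index the transformed corpus and run a query against it
index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary))
query_bow = dictionary.doc2bow("human computer interaction".lower().split())
print(index[tfidf[query_bow]])  # similarity of the query to each document
```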

Text -> Vector

The basic approach is the same as described in the Corpus and Vector sections above.

Streaming loading

```python
from smart_open import open  # for transparently opening remote files

class MyCorpus(object):  # an iterable corpus
    def __iter__(self):
        for line in open('https://radimrehurek.com/gensim/mycorpus.txt'):
            # assume there's one document per line, tokens separated by whitespace
            yield dictionary.doc2bow(line.lower().split())

corpus_memory_friendly = MyCorpus()  # doesn't load the corpus into memory!
for vector in corpus_memory_friendly:  # load one vector into memory at a time
    print(vector)
"""
[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (5, 1), (7, 1), (8, 1)]
[(1, 1), (5, 2), (8, 1)]
[(3, 1), (6, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]
"""
```
```python
from six import iteritems

# collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/gensim/mycorpus.txt'))
# remove stop words and words that appear only once
stop_ids = [
    dictionary.token2id[stopword]
    for stopword in stoplist
    if stopword in dictionary.token2id
]
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)  # remove stop words and words that appear only once
dictionary.compactify()  # remove gaps in id sequence after words that were removed
print(dictionary)
"""
Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)
"""
```

A key strength of Gensim is its small memory footprint: corpora can be supplied as iterables or generators instead of being loaded into memory all at once.
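
For instance, a plain generator works much like the iterable class above (a small sketch; it reads the same `mycorpus.txt` file and reuses the `dictionary` built earlier). Note that a bare generator can only be consumed once; for models that need multiple passes over the data, use a class with `__iter__` as shown above.

```python
def stream_bow(path='https://radimrehurek.com/gensim/mycorpus.txt'):
    # yields one bag-of-words vector at a time; the full corpus is never held in memory
    for line in open(path):
        yield dictionary.doc2bow(line.lower().split())

for vector in stream_bow():
    print(vector)
```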

Similarity queries

```python
from collections import defaultdict
from gensim import corpora

documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]
# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]
# remove words that appear only once
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]
# build corpus vectors
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# LSI model: maps each document to (topic_id, topic weight) pairs
from gensim import models
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
doc = "Human computer interaction"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]  # convert the query to LSI space
print(vec_lsi)
"""
[(0, 0.4618210045327158), (1, 0.07002766527900064)]
"""
# initialize query index
from gensim import similarities
index = similarities.MatrixSimilarity(lsi[corpus])  # transform corpus to LSI space and index it
# save and load index
index.save('/tmp/deerwester.index')
index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')
# query similarity
sims = index[vec_lsi]  # perform a similarity query against the corpus
print(list(enumerate(sims)))  # print (document_number, document_similarity) 2-tuples
"""
[(0, 0.998093), (1, 0.93748635), (2, 0.9984453), (3, 0.9865886), (4, 0.90755945), (5, -0.12416792), (6, -0.10639259), (7, -0.09879464), (8, 0.050041765)]
"""
sims = sorted(enumerate(sims), key=lambda item: -item[1])
for doc_position, doc_score in sims:
    print((doc_position, doc_score), documents[doc_position])
"""
(2, 0.9984453) The EPS user interface management system
(0, 0.998093) Human machine interface for lab abc computer applications
(3, 0.9865886) System and human system engineering testing of EPS
(1, 0.93748635) A survey of user opinion of computer system response time
(4, 0.90755945) Relation of user perceived response time to error measurement
(8, 0.050041765) Graph minors A survey
(7, -0.09879464) Graph minors IV Widths of trees and well quasi ordering
(6, -0.10639259) The intersection graph of paths in trees
(5, -0.12416792) The generation of random binary unordered trees
"""
```

Word2Vec

Loading a pretrained model

```python
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')  # load pretrained model
for i, word in enumerate(wv.vocab):  # vocab: the model's vocabulary
    if i == 10:
        break
    print(word)
"""
</s>
in
for
that
is
on
##
The
with
said
"""
try:
    vec_cameroon = wv['cameroon']
except KeyError:
    print("The word 'cameroon' does not appear in this model")
"""
The word 'cameroon' does not appear in this model
"""
pairs = [
    ('car', 'minivan'),    # a minivan is a kind of car
    ('car', 'bicycle'),    # still a wheeled vehicle
    ('car', 'airplane'),   # ok, no wheels, but still a vehicle
    ('car', 'cereal'),     # ... and so on
    ('car', 'communism'),
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))  # similarity: cosine similarity of two word vectors
"""
'car'   'minivan'   0.69
'car'   'bicycle'   0.54
'car'   'airplane'  0.42
'car'   'cereal'    0.14
'car'   'communism' 0.06
"""
print(wv.most_similar(positive=['car', 'minivan'], topn=5))
"""
[('SUV', 0.853219211101532), ('vehicle', 0.8175784349441528), ('pickup_truck', 0.7763689160346985), ('Jeep', 0.7567334175109863), ('Ford_Explorer', 0.756571888923645)]
"""
print(wv.doesnt_match(['fire', 'water', 'land', 'sea', 'air', 'car']))
"""
car
"""
```

Training your own model

```python
from gensim.test.utils import datapath
from gensim import utils

# load the training data; any iterable or generator of token lists works
class MyCorpus(object):
    """An iterator that yields sentences (lists of str)."""
    def __iter__(self):
        corpus_path = datapath('lee_background.cor')
        for line in open(corpus_path):
            # assume there's one document per line, tokens separated by whitespace
            yield utils.simple_preprocess(line)

import gensim.models

sentences = MyCorpus()
model = gensim.models.Word2Vec(sentences=sentences)
# The trained word vectors are stored in a KeyedVectors instance in model.wv:
vector = model.wv['computer']  # numpy vector of a word
# Trim unneeded model state: uses much less RAM and allows fast loading and memory sharing (mmap).
word_vectors = model.wv
del model
```
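
As a quick check of the trained vectors (a usage sketch; it assumes the corpus vocabulary contains 'computer'):

```python
# nearest neighbours of 'computer' according to the trained KeyedVectors
for word, score in word_vectors.most_similar('computer', topn=5):
    print(word, score)
```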

Word2Vec parameters

  • sentences (iterable of iterables, optional) – The sentences iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network.
  • min_count (int, optional) – Ignores all words with total frequency lower than this.
  • size (int, optional) – Dimensionality of the word vectors.
  • window (int, optional) – Maximum distance between the current and predicted word within a sentence.
  • workers (int, optional) – Use these many worker threads to train the model (=faster training with multicore machines). install Cython to make it work!
  • sg ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW.
  • cbow_mean ({0, 1}, optional) – If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used.
  • hs ({0, 1}, optional) – If 1, hierarchical softmax will be used for model training. If 0, and negative is non-zero, negative sampling will be used.
  • negative (int, optional) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
  • sample (float, optional) – The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5).
  • compute_loss (bool, optional) – If True, computes and stores loss value which can be retrieved using get_latest_training_loss().
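
For illustration, a minimal sketch of how these parameters fit together when constructing a model. Parameter names follow the gensim 3.x API quoted above (gensim 4.x renames `size` to `vector_size`), and `sentences` is assumed to be an iterable of token lists such as `MyCorpus()` from the previous section; the concrete values are only examples.

```python
import gensim.models

model = gensim.models.Word2Vec(
    sentences=sentences,   # iterable of token lists (streamed for large corpora)
    size=100,              # dimensionality of the word vectors
    window=5,              # max distance between current and predicted word
    min_count=5,           # ignore words with total frequency lower than this
    workers=4,             # worker threads (install Cython for a real speed-up)
    sg=1,                  # 1 = skip-gram, 0 = CBOW
    hs=0, negative=5,      # hs=0 with negative > 0 -> negative sampling with 5 noise words
    sample=1e-5,           # downsampling threshold for very frequent words
    compute_loss=True,     # keep the training loss around
)
print(model.get_latest_training_loss())
```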

Loading and saving

  • If you need to continue training later

Use model.save(filename) to save the full model (including training state) and Word2Vec.load(filename) to load it back.
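
For example (a minimal sketch; the filename is a placeholder):

```python
from gensim.models import Word2Vec

model.save("word2vec.model")               # saves the full model, including training state
model = Word2Vec.load("word2vec.model")    # load it back and keep training
model.train([["hello", "world"]], total_examples=1, epochs=1)
```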

  • If you do not need to continue training

```python
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

path = get_tmpfile("wordvectors.kv")

model.wv.save(path)
wv = KeyedVectors.load(path, mmap='r')
vector = wv['computer']  # numpy vector of a word
```

Gensim can also load word vectors in the "word2vec C format", as a KeyedVectors instance:

```python
from gensim.test.utils import datapath

wv_from_text = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'), binary=False)  # C text format
wv_from_bin = KeyedVectors.load_word2vec_format(datapath("euclidean_vectors.bin"), binary=True)  # C bin format
```

  • Only used to initialize word embeddings and word2index

```python
import json
import gensim
import numpy as np
import torch
import torch.nn as nn

# train word2vec
# self.corpus: [[word]]
model = gensim.models.word2vec.Word2Vec(self.corpus, size=300, min_count=1)
# model.save('word2vec_redial')
# word2index: reserve the first 4 ids for special tokens
word2index = {word: i + 4 for i, word in enumerate(model.wv.index2word)}
word2index['_split_'] = len(word2index) + 4
json.dump(word2index, open('word2index_redial.json', 'w', encoding='utf-8'), ensure_ascii=False)
# save word embeddings in numpy format: 4 zero rows for special tokens, one row per word, one zero row for '_split_'
word2embedding = [[0] * 300] * 4 + [model.wv[word] for word in model.wv.index2word] + [[0] * 300]
print(np.shape(word2embedding))
np.save('word2vec_redial.npy', word2embedding)
# initialize the word embedding layer in the downstream model
e = nn.Embedding(len(dictionary) + 4, embedding_size, padding_idx)
e.weight.data.copy_(torch.from_numpy(np.load('word2vec_redial.npy')))
```