Core Concepts

Document

A document is a piece of text, represented in Python as a string.

```python
document = "Human machine interface for lab abc computer applications"
```

Corpus

A corpus is a collection of documents, used as input to a model.

```python
text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]
```

To assign an integer ID to each word in the corpus, use gensim.corpora.Dictionary:

```python
import pprint
from collections import defaultdict

# Create a set of frequent words
stoplist = set('for a of the and to in'.split(' '))
# Lowercase each document, split it by white space and filter out stopwords
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in text_corpus]
# Count word frequencies
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
pprint.pprint(processed_corpus)
'''
[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]
'''
from gensim import corpora
dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)
'''
Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)
'''
```

Vector

A vector is the numerical representation of a document.

```python
pprint.pprint(dictionary.token2id)  # get the token -> id mapping
"""
{'computer': 0,
 'eps': 8,
 'graph': 10,
 'human': 1,
 'interface': 2,
 'minors': 11,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'trees': 9,
 'user': 7}
"""
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())  # convert the document to a vector
print(new_vec)
"""
[(0, 1), (1, 1)]  # first element is the token id, second is its count; tokens not in the dictionary are ignored
"""
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
pprint.pprint(bow_corpus)
"""
[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]
"""
```

Model

A model is an algorithm that transforms vectors, mapping the raw document vector space into a target space.

```python
from gensim import models

# train the model
tfidf = models.TfidfModel(bow_corpus)
# transform the "system minors" string
words = "system minors".lower().split()
print(tfidf[dictionary.doc2bow(words)])
"""
[(5, 0.5898341626740045), (11, 0.8075244024440723)]
"""
from gensim import similarities

index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=12)
query_document = 'system engineering'.split()
query_bow = dictionary.doc2bow(query_document)
sims = index[tfidf[query_bow]]
for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
    print(document_number, score)  # similarity between the query and each document in bow_corpus
"""
3 0.7184812
2 0.41707572
1 0.32448703
0 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0
"""
```

Summary

corpus of documents -> vector representation -> a model transforms the raw vectors into a target space
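
Putting these pieces together, here is a minimal end-to-end sketch (reusing the `processed_corpus` built in the Corpus section above):

```python
from gensim import corpora, models, similarities

# corpus of documents -> bag-of-words vectors
dictionary = corpora.Dictionary(processed_corpus)
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

# a model transforms the raw bag-of-words vectors into a target space (TF-IDF here)
tfidf = models.TfidfModel(bow_corpus)

# index the transformed corpus and run a query against it
index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary))
query_bow = dictionary.doc2bow("human computer interaction".lower().split())
print(index[tfidf[query_bow]])  # similarity of the query to each document
```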

Text -> Vector

The basic approach is the same as described in the Corpus and Vector sections above.

Streaming loading

```python
from smart_open import open  # for transparently opening remote files

class MyCorpus(object):  # an iterable corpus
    def __iter__(self):
        for line in open('https://radimrehurek.com/gensim/mycorpus.txt'):
            # assume there's one document per line, tokens separated by whitespace
            yield dictionary.doc2bow(line.lower().split())

corpus_memory_friendly = MyCorpus()  # doesn't load the corpus into memory!
for vector in corpus_memory_friendly:  # load one vector into memory at a time
    print(vector)
"""
[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (5, 1), (7, 1), (8, 1)]
[(1, 1), (5, 2), (8, 1)]
[(3, 1), (6, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]
"""
```
```python
from six import iteritems

# collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/gensim/mycorpus.txt'))
# remove stop words and words that appear only once
stop_ids = [
    dictionary.token2id[stopword]
    for stopword in stoplist
    if stopword in dictionary.token2id
]
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)  # remove stop words and words that appear only once
dictionary.compactify()  # remove gaps in id sequence after words that were removed
print(dictionary)
"""
Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)
"""
```

A key strength of Gensim is its small memory footprint: corpora can be supplied as iterables or generators instead of being loaded into memory all at once.
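
For instance, a plain generator works much like the iterable class above (a small sketch; it reads the same `mycorpus.txt` file and reuses the `dictionary` built earlier). Note that a bare generator can only be consumed once; for models that need multiple passes over the data, use a class with `__iter__` as shown above.

```python
def stream_bow(path='https://radimrehurek.com/gensim/mycorpus.txt'):
    # yields one bag-of-words vector at a time; the full corpus is never held in memory
    for line in open(path):
        yield dictionary.doc2bow(line.lower().split())

for vector in stream_bow():
    print(vector)
```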

Similarity queries

```python
from collections import defaultdict
from gensim import corpora

documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]
# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]
# remove words that appear only once
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]
# build corpus vectors
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# LSI model: maps each document to (topic_id, topic weight) pairs
from gensim import models
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
doc = "Human computer interaction"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]  # convert the query to LSI space
print(vec_lsi)
"""
[(0, 0.4618210045327158), (1, 0.07002766527900064)]
"""
# initialize query index
from gensim import similarities
index = similarities.MatrixSimilarity(lsi[corpus])  # transform corpus to LSI space and index it
# save and load index
index.save('/tmp/deerwester.index')
index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')
# query similarity
sims = index[vec_lsi]  # perform a similarity query against the corpus
print(list(enumerate(sims)))  # print (document_number, document_similarity) 2-tuples
"""
[(0, 0.998093), (1, 0.93748635), (2, 0.9984453), (3, 0.9865886), (4, 0.90755945), (5, -0.12416792), (6, -0.10639259), (7, -0.09879464), (8, 0.050041765)]
"""
sims = sorted(enumerate(sims), key=lambda item: -item[1])
for doc_position, doc_score in sims:
    print((doc_position, doc_score), documents[doc_position])
"""
(2, 0.9984453) The EPS user interface management system
(0, 0.998093) Human machine interface for lab abc computer applications
(3, 0.9865886) System and human system engineering testing of EPS
(1, 0.93748635) A survey of user opinion of computer system response time
(4, 0.90755945) Relation of user perceived response time to error measurement
(8, 0.050041765) Graph minors A survey
(7, -0.09879464) Graph minors IV Widths of trees and well quasi ordering
(6, -0.10639259) The intersection graph of paths in trees
(5, -0.12416792) The generation of random binary unordered trees
"""
```

Word2Vec

Loading a pretrained model

```python
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')  # load pretrained model
for i, word in enumerate(wv.vocab):  # vocab: the model's vocabulary
    if i == 10:
        break
    print(word)
"""
</s>
in
for
that
is
on
##
The
with
said
"""
try:
    vec_cameroon = wv['cameroon']
except KeyError:
    print("The word 'cameroon' does not appear in this model")
"""
The word 'cameroon' does not appear in this model
"""
pairs = [
    ('car', 'minivan'),    # a minivan is a kind of car
    ('car', 'bicycle'),    # still a wheeled vehicle
    ('car', 'airplane'),   # ok, no wheels, but still a vehicle
    ('car', 'cereal'),     # ... and so on
    ('car', 'communism'),
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))  # similarity: cosine similarity of two word vectors
"""
'car'   'minivan'   0.69
'car'   'bicycle'   0.54
'car'   'airplane'  0.42
'car'   'cereal'    0.14
'car'   'communism' 0.06
"""
print(wv.most_similar(positive=['car', 'minivan'], topn=5))
"""
[('SUV', 0.853219211101532), ('vehicle', 0.8175784349441528), ('pickup_truck', 0.7763689160346985), ('Jeep', 0.7567334175109863), ('Ford_Explorer', 0.756571888923645)]
"""
print(wv.doesnt_match(['fire', 'water', 'land', 'sea', 'air', 'car']))
"""
car
"""
```

Training your own model

```python
from gensim.test.utils import datapath
from gensim import utils

# load the training data; any iterable or generator of token lists works
class MyCorpus(object):
    """An iterator that yields sentences (lists of str)."""
    def __iter__(self):
        corpus_path = datapath('lee_background.cor')
        for line in open(corpus_path):
            # assume there's one document per line, tokens separated by whitespace
            yield utils.simple_preprocess(line)

import gensim.models

sentences = MyCorpus()
model = gensim.models.Word2Vec(sentences=sentences)
# The trained word vectors are stored in a KeyedVectors instance in model.wv:
vector = model.wv['computer']  # numpy vector of a word
# Trim unneeded model state: uses much less RAM and allows fast loading and memory sharing (mmap).
word_vectors = model.wv
del model
```
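
As a quick check of the trained vectors (a usage sketch; it assumes the corpus vocabulary contains 'computer'):

```python
# nearest neighbours of 'computer' according to the trained KeyedVectors
for word, score in word_vectors.most_similar('computer', topn=5):
    print(word, score)
```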

Word2Vec parameters

  • sentences (iterable of iterables, optional) – The sentences iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network.
  • min_count (int, optional) – Ignores all words with total frequency lower than this.
  • size (int, optional) – Dimensionality of the word vectors.
  • window (int, optional) – Maximum distance between the current and predicted word within a sentence.
  • workers (int, optional) – Use these many worker threads to train the model (=faster training with multicore machines). install Cython to make it work!
  • sg ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW.
  • cbow_mean ({0, 1}, optional) – If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used.
  • hs ({0, 1}, optional) – If 1, hierarchical softmax will be used for model training. If 0, and negative is non-zero, negative sampling will be used.
  • negative (int, optional) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
  • sample (float, optional) – The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5).
  • compute_loss (bool, optional) – If True, computes and stores loss value which can be retrieved using get_latest_training_loss().
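
For illustration, a minimal sketch of how these parameters fit together when constructing a model. Parameter names follow the gensim 3.x API quoted above (gensim 4.x renames `size` to `vector_size`), and `sentences` is assumed to be an iterable of token lists such as `MyCorpus()` from the previous section; the concrete values are only examples.

```python
import gensim.models

model = gensim.models.Word2Vec(
    sentences=sentences,   # iterable of token lists (streamed for large corpora)
    size=100,              # dimensionality of the word vectors
    window=5,              # max distance between current and predicted word
    min_count=5,           # ignore words with total frequency lower than this
    workers=4,             # worker threads (install Cython for a real speed-up)
    sg=1,                  # 1 = skip-gram, 0 = CBOW
    hs=0, negative=5,      # hs=0 with negative > 0 -> negative sampling with 5 noise words
    sample=1e-5,           # downsampling threshold for very frequent words
    compute_loss=True,     # keep the training loss around
)
print(model.get_latest_training_loss())
```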

Loading and saving

  • If you need to continue training later

Use model.save(filename) to save the full model (including training state) and Word2Vec.load(filename) to load it back.
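
For example (a minimal sketch; the filename is a placeholder):

```python
from gensim.models import Word2Vec

model.save("word2vec.model")               # saves the full model, including training state
model = Word2Vec.load("word2vec.model")    # load it back and keep training
model.train([["hello", "world"]], total_examples=1, epochs=1)
```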

  • If you do not need to continue training

```python
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

path = get_tmpfile("wordvectors.kv")

model.wv.save(path)
wv = KeyedVectors.load(path, mmap='r')
vector = wv['computer']  # numpy vector of a word
```

Gensim can also load word vectors in the "word2vec C format", as a KeyedVectors instance:

```python
from gensim.test.utils import datapath

wv_from_text = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'), binary=False)  # C text format
wv_from_bin = KeyedVectors.load_word2vec_format(datapath("euclidean_vectors.bin"), binary=True)  # C bin format
```

  • Only used to initialize word embeddings and word2index

```python
import json
import gensim
import numpy as np
import torch
import torch.nn as nn

# train word2vec
# self.corpus: [[word]]
model = gensim.models.word2vec.Word2Vec(self.corpus, size=300, min_count=1)
# model.save('word2vec_redial')
# word2index: reserve the first 4 ids for special tokens
word2index = {word: i + 4 for i, word in enumerate(model.wv.index2word)}
word2index['_split_'] = len(word2index) + 4
json.dump(word2index, open('word2index_redial.json', 'w', encoding='utf-8'), ensure_ascii=False)
# save word embeddings in numpy format: 4 zero rows for special tokens, one row per word, one zero row for '_split_'
word2embedding = [[0] * 300] * 4 + [model.wv[word] for word in model.wv.index2word] + [[0] * 300]
print(np.shape(word2embedding))
np.save('word2vec_redial.npy', word2embedding)
# initialize the word embedding layer in the downstream model
e = nn.Embedding(len(dictionary) + 4, embedding_size, padding_idx)
e.weight.data.copy_(torch.from_numpy(np.load('word2vec_redial.npy')))
```