exploring_word_vectors.pdf

- Import packages

```python
# ----------------------------------------------
# All Import Statements Defined Here
# Note: Do not add to this list.
# All the dependencies you need can be installed by running .
# ----------------------------------------------

import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import pprint
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]
import nltk
nltk.download('reuters')
from nltk.corpus import reuters
import numpy as np
import random
import scipy as sp
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

START_TOKEN = '<START>'
END_TOKEN = '<END>'

np.random.seed(0)
random.seed(0)
# ----------------------------------------------
```

# Count-based word vectors

- Read in the corpus

```python
def read_corpus(category="crude"):
    """ Read files from the specified Reuters category.
        Params:
            category (string): category name
        Return:
            list of lists, with words from each of the processed files
    """
    files = reuters.fileids(category)
    return [[START_TOKEN] + [w.lower() for w in list(reuters.words(f))] + [END_TOKEN] for f in files]

reuters_corpus = read_corpus()
pprint.pprint(reuters_corpus[:3], compact=True, width=100)
```
- Store the corpus's distinct words in a list (`corpus_words`) and count the total number of distinct words (a small toy check follows the code block below)

```python
def distinct_words(corpus):
    """ Determine a list of distinct words for the corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): list of distinct words across the corpus, sorted (using python 'sorted' function)
            num_corpus_words (integer): number of distinct words across the corpus
    """
    corpus_words = []
    num_corpus_words = -1
    # ------------------
    # Write your implementation here.
    corpus_words = sorted(list(set([word for words_list in corpus for word in words_list])))
    num_corpus_words = len(corpus_words)
    # ------------------
    return corpus_words, num_corpus_words
```
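
A minimal toy check (an added sketch, not one of the assignment's own cells) of what `distinct_words` returns on a tiny handmade corpus:

```python
# Added sketch: sanity-check distinct_words on a tiny handmade corpus.
toy_corpus = [["<START>", "all", "that", "glitters", "<END>"],
              ["<START>", "all", "is", "not", "gold", "<END>"]]
toy_words, toy_count = distinct_words(toy_corpus)
print(toy_words)   # sorted unique tokens across both documents
print(toy_count)   # 8 distinct tokens in this toy corpus
```
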
- Compute the co-occurrence matrix (by simple counting)

Assignment 1: Exploring Word Vectors - Figure 1

```python
def compute_co_occurrence_matrix(corpus, window_size=4):
    """ Compute co-occurrence matrix for the given corpus and window_size (default of 4).
        Note: Each word in a document should be at the center of a window. Words near edges will have a smaller
              number of co-occurring words.
              For example, if we take the document "START All that glitters is not gold END" with window size of 4,
              "All" will co-occur with "START", "that", "glitters", "is", and "not".
        Params:
            corpus (list of list of strings): corpus of documents
            window_size (int): size of context window
        Return:
            M (numpy matrix of shape (number of corpus words, number of corpus words)):
                Co-occurrence matrix of word counts.
                The ordering of the words in the rows/columns should be the same as the ordering of the words given by the distinct_words function.
            word2Ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
    """
    words, num_words = distinct_words(corpus)
    M = None
    word2Ind = {}
    # ------------------
    # Write your implementation here.
    M = np.zeros((num_words, num_words))
    word2Ind = dict(zip(words, range(num_words)))
    for doc in corpus:
        doc_len = len(doc)
        for current_idx in range(0, doc_len):
            left_boundary = max(current_idx - window_size, 0)
            right_boundary = min(current_idx + window_size + 1, doc_len)
            outside_words = doc[left_boundary:current_idx] + doc[current_idx + 1:right_boundary]
            center_word = doc[current_idx]
            center_idx = word2Ind[center_word]
            for outside_word in outside_words:
                outside_idx = word2Ind[outside_word]
                M[outside_idx, center_idx] += 1
    # ------------------
    return M, word2Ind
```
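
A small added sketch (not one of the assignment's own test cells) that runs `compute_co_occurrence_matrix` on the single-document example from the docstring, so the edge behavior of the window is visible:

```python
# Added sketch: co-occurrence counts for the docstring's example document.
toy_doc = [["<START>", "all", "that", "glitters", "is", "not", "gold", "<END>"]]
M_toy, word2Ind_toy = compute_co_occurrence_matrix(toy_doc, window_size=4)
# With window_size=4, "all" co-occurs once each with "<START>", "that",
# "glitters", "is", and "not" (and not with "gold" or "<END>").
row = M_toy[word2Ind_toy["all"]]
for w, i in word2Ind_toy.items():
    print(w, int(row[i]))
```
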
- Use TruncatedSVD to reduce the co-occurrence matrix to k dimensions

  - Size of the reduced matrix: number of rows = number of distinct words in the corpus, number of columns = k; i.e. each word is represented by a k-dimensional vector (see the short SVD note after the code block below)

```python
def reduce_to_k_dim(M, k=2):
    """ Reduce a co-occurrence count matrix of dimensionality (num_corpus_words, num_corpus_words)
        to a matrix of dimensionality (num_corpus_words, k) using the following SVD function from Scikit-Learn:
            - http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
        Params:
            M (numpy matrix of shape (number of corpus words, number of corpus words)): co-occurrence matrix of word counts
            k (int): embedding size of each word after dimension reduction
        Return:
            M_reduced (numpy matrix of shape (number of corpus words, k)): matrix of k-dimensional word embeddings.
                In terms of the SVD from math class, this actually returns U * S
    """
    n_iters = 10  # Use this parameter in your call to `TruncatedSVD`
    M_reduced = None
    print("Running Truncated SVD over %i words..." % (M.shape[0]))
    # ------------------
    # Write your implementation here.
    svd = TruncatedSVD(n_components=k, n_iter=n_iters)
    M_reduced = svd.fit_transform(M)
    # ------------------
    print("Done.")
    return M_reduced
```
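
A short added note on the linear algebra behind this step, consistent with the docstring's "returns U * S": truncated SVD keeps only the top-k singular values and vectors,

$$
M \approx U_k \Sigma_k V_k^{\top}, \qquad M_{\text{reduced}} = U_k \Sigma_k \in \mathbb{R}^{|V| \times k},
$$

where |V| is the number of distinct corpus words, so `fit_transform` returns one k-dimensional row per word, exactly the shape described in the bullet above.
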
- Plot the first two dimensions of the word vectors / word embeddings

```python
def plot_embeddings(M_reduced, word2Ind, words):
    """ Plot in a scatterplot the embeddings of the words specified in the list "words".
        NOTE: do not plot all the words listed in M_reduced / word2Ind.
        Include a label next to each point.
        Params:
            M_reduced (numpy matrix of shape (number of unique words in the corpus, k)): matrix of k-dimensional word embeddings
            word2Ind (dict): dictionary that maps word to indices for matrix M
            words (list of strings): words whose embeddings we want to visualize
    """
    # ------------------
    # Write your implementation here.
    # Plot and label only the requested words, using their first two dimensions.
    for word in words:
        idx = word2Ind[word]
        x, y = M_reduced[idx, 0], M_reduced[idx, 1]
        plt.scatter(x, y, marker='x', color='red')
        plt.text(x, y, word, fontsize=9)
    # ------------------
```
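
A small added usage sketch (the assignment has its own test cell; this toy version only illustrates the expected call pattern):

```python
# Added sketch: plot four labeled 2-D points to check plot_embeddings works.
M_toy = np.array([[1.0, 1.0], [-1.0, -1.0], [1.0, -1.0], [-1.0, 1.0]])
word2Ind_toy = {'test1': 0, 'test2': 1, 'test3': 2, 'test4': 3}
plot_embeddings(M_toy, word2Ind_toy, list(word2Ind_toy.keys()))
plt.show()
```
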
- Putting it together: run and plot

```python
# -------------------------------
# Run This Cell to Produce Your Plot
# -------------------------------
reuters_corpus = read_corpus()
M_co_occurrence, word2Ind_co_occurrence = compute_co_occurrence_matrix(reuters_corpus)
M_reduced_co_occurrence = reduce_to_k_dim(M_co_occurrence, k=2)

# Rescale (normalize) the rows to make them each of unit-length
M_lengths = np.linalg.norm(M_reduced_co_occurrence, axis=1)
M_normalized = M_reduced_co_occurrence / M_lengths[:, np.newaxis]  # broadcasting

words = ['barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela']
plot_embeddings(M_normalized, word2Ind_co_occurrence, words)
```
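
An optional added check (not part of the original cell): after the rescaling above, every row of `M_normalized` should have unit L2 norm.

```python
# Added check: rows of M_normalized should all have L2 norm ~ 1 after rescaling.
print(np.allclose(np.linalg.norm(M_normalized, axis=1), 1.0))  # expected: True
```
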

![](https://cdn.nlark.com/yuque/0/2021/png/1324638/1618576631839-3327467d-bc4b-4dd8-9674-1cbb854dc986.png)

# Prediction-based word vectors: word2vec
- Load the word2vec vectors

```python
def load_word2vec():
    """ Load Word2Vec Vectors
        Return:
            wv_from_bin: All 3 million embeddings, each of length 300
    """
    import gensim.downloader as api
    wv_from_bin = api.load("word2vec-google-news-300")
    vocab = list(wv_from_bin.vocab.keys())
    print("Loaded vocab size %i" % len(vocab))
    return wv_from_bin

# -----------------------------------
# Run Cell to Load Word Vectors
# Note: This may take several minutes
# -----------------------------------
wv_from_bin = load_word2vec()
```
- Put a sample of the word2vec vectors (10,000 shuffled words plus the required words) into a matrix M, then reduce the dimensionality with TruncatedSVD

```python
def get_matrix_of_vectors(wv_from_bin, required_words=['barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela']):
    """ Put the word2vec vectors into a matrix M.
        Param:
            wv_from_bin: KeyedVectors object; the 3 million word2vec vectors loaded from file
        Return:
            M: numpy matrix shape (num words, 300) containing the vectors
            word2Ind: dictionary mapping each word to its row number in M
    """
    import random
    words = list(wv_from_bin.vocab.keys())
    print("Shuffling words ...")
    random.shuffle(words)
    words = words[:10000]
    print("Putting %i words into word2Ind and matrix M..." % len(words))
    word2Ind = {}
    M = []
    curInd = 0
    for w in words:
        try:
            M.append(wv_from_bin.word_vec(w))
            word2Ind[w] = curInd
            curInd += 1
        except KeyError:
            continue
    for w in required_words:
        try:
            M.append(wv_from_bin.word_vec(w))
            word2Ind[w] = curInd
            curInd += 1
        except KeyError:
            continue
    M = np.stack(M)
    print("Done.")
    return M, word2Ind

# -----------------------------------------------------------------
# Run Cell to Reduce 300-Dimensional Word Embeddings to k Dimensions
# Note: This may take several minutes
# -----------------------------------------------------------------
M, word2Ind = get_matrix_of_vectors(wv_from_bin)  # put the word2vec vectors into matrix M
M_reduced = reduce_to_k_dim(M, k=2)               # reduce to k dimensions
```
- Plot

```python
words = ['barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela']
plot_embeddings(M_reduced, word2Ind, words)
```

Assignment 1: Exploring Word Vectors - Figure 2

- Explore a polysemous word

```python
# ------------------
# Write your polysemous word exploration code here.
wv_from_bin.most_similar("party")
# ------------------
```

The code above lists the ten words most similar to "party". The output shows that "party" is polysemous: it carries both the "social gathering" sense and the "political party" sense.

```
[('Party', 0.7125184535980225), ('parties', 0.6745480298995972), ('partys', 0.5965836644172668),
 ('Democratic_Party', 0.5447009801864624), ('LOUDON_NH_Brad_Keselowski', 0.5346890687942505),
 ('caucus', 0.522636890411377), ('pary', 0.5175394415855408), ('faction', 0.5168994665145874),
 ('mad_hatter_tea', 0.507461428642273), ('Labour_Party', 0.4938312768936157)]
```

- Sometimes the cosine similarity between a pair of synonyms is lower than the cosine similarity between a word and its antonym

```python
# ------------------
# Write your synonym & antonym exploration code here.
w1 = "happy"
w2 = "cheerful"
w3 = "sad"
w1_w2_dist = wv_from_bin.distance(w1, w2)
w1_w3_dist = wv_from_bin.distance(w1, w3)
print("Synonyms {}, {} have cosine distance: {}".format(w1, w2, w1_w2_dist))
print("Antonyms {}, {} have cosine distance: {}".format(w1, w3, w1_w3_dist))
# ------------------
```

The output below shows that although "happy" and "cheerful" are synonyms while "happy" and "sad" are antonyms, the cosine distance between the former pair is larger than that between the latter pair:

```
Synonyms happy, cheerful have cosine distance: 0.6162261962890625
Antonyms happy, sad have cosine distance: 0.46453857421875
```
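
Note that gensim's `distance` is one minus the cosine similarity, so a larger distance means a smaller similarity; the relationship can be checked directly (an added sketch):

```python
# Added sketch: distance(w1, w2) == 1 - similarity(w1, w2) for KeyedVectors.
print(wv_from_bin.similarity("happy", "cheerful"))    # cosine similarity
print(1 - wv_from_bin.distance("happy", "cheerful"))  # should match, up to float precision
```
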
- Use word vectors to complete analogies:

```python
# Run this cell to answer the analogy -- man : king :: woman : x
pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'king'], negative=['man']))
```

    king + woman - man = queen:

```
[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431607246399),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.5181134343147278),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411999702454)]
```

Assignment 1: Exploring Word Vectors - Figure 3
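
The `most_similar` call above implements the vector-arithmetic view of the analogy. A rough added sketch of doing the same thing by hand (gensim's `most_similar` additionally works with normalized vectors and excludes the three query words, so the results can differ slightly):

```python
# Added sketch: the analogy as explicit vector arithmetic.
# The query words themselves ('king' in particular) are likely to show up
# among the nearest neighbors because they are not filtered out here.
vec = wv_from_bin.word_vec('king') - wv_from_bin.word_vec('man') + wv_from_bin.word_vec('woman')
pprint.pprint(wv_from_bin.similar_by_vector(vec, topn=5))
```
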

- But the analogy can also come out wrong:

```python
# ------------------
# Write your incorrect analogy exploration code here.
pprint.pprint(wv_from_bin.most_similar(positive=['china', 'japanese'], negative=['japan']))
# ------------------
```

The output is not chinese; presumably the lowercase token "china" in the training corpus is dominated by the porcelain/tableware sense, so the nearest neighbors are dishware terms:

```
[('porcelain', 0.5757269263267517), ('dinnerware', 0.563517689704895), ('crockery', 0.5430431365966797),
 ('silver_flatware', 0.540193498134613), ('crystal_stemware', 0.5391061902046204), ('flatware', 0.5293956398963928),
 ('tableware', 0.5281988978385925), ('china_plates', 0.5269104838371277), ('bone_china', 0.5260535478591919),
 ('transferware', 0.5206524133682251)]
```

- Guided analysis of bias (e.g. gender, race, sexual orientation) in the word vectors

```python
# Run this cell
# Here `positive` indicates the list of words to be similar to and `negative` indicates the list of words to be
# most dissimilar from.
pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'boss'], negative=['man']))
print()
pprint.pprint(wv_from_bin.most_similar(positive=['man', 'boss'], negative=['woman']))
```

Output:

```
[('bosses', 0.5522644519805908),
 ('manageress', 0.49151360988616943),
 ('exec', 0.45940813422203064),
 ('Manageress', 0.45598435401916504),
 ('receptionist', 0.4474116563796997),
 ('Jane_Danson', 0.44480544328689575),
 ('Fiz_Jennie_McAlpine', 0.44275766611099243),
 ('Coronation_Street_actress', 0.44275566935539246),
 ('supremo', 0.4409853219985962),
 ('coworker', 0.43986251950263977)]

[('supremo', 0.6097398400306702),
 ('MOTHERWELL_boss', 0.5489562153816223),
 ('CARETAKER_boss', 0.5375303626060486),
 ('Bully_Wee_boss', 0.5333974361419678),
 ('YEOVIL_Town_boss', 0.5321705341339111),
 ('head_honcho', 0.5281980037689209),
 ('manager_Stan_Ternent', 0.525971531867981),
 ('Viv_Busby', 0.5256162881851196),
 ('striker_Gabby_Agbonlahor', 0.5250812768936157),
 ('BARNSLEY_boss', 0.5238943099975586)]
```
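
A small added follow-up probe (hypothetical, not part of the assignment output above): comparing raw cosine similarities directly gives another view of the same gender association around "boss":

```python
# Added sketch: compare how 'boss' relates to gendered words and to the
# gender-marked occupation term that surfaced in the output above.
print(wv_from_bin.similarity('boss', 'man'), wv_from_bin.similarity('boss', 'woman'))
print(wv_from_bin.similarity('boss', 'manageress'))
```
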