• Paper title

    GloVe: Global Vectors for Word Representation

    • Co-occurrence matrix

    Let $X_{ij}$ be the number of times word $j$ appears in the context window of word $i$, let $X_i = \sum_k X_{ik}$, and let $P_{ij} = X_{ij} / X_i$ be the probability that word $j$ appears in the context of word $i$. GloVe starts from the ratio

    $ratio(i,j,k) = P_{ik} / P_{jk}$

    whose behavior falls into four cases:

    ratio(i,j,k)            words j, k related    words j, k unrelated
    words i, k related      close to 1            large
    words i, k unrelated    small                 close to 1
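
    For example, Table 1 of the paper takes i = ice and j = steam. For k = solid (related to ice but not to steam) the measured ratio $P_{ik}/P_{jk}$ is 8.9; for k = gas (related to steam but not to ice) it is 0.085; and for k = water or k = fashion (related to both, or to neither) it is 1.36 and 0.96, both close to 1. The ratio therefore separates relevant from irrelevant words much more sharply than the raw probabilities do.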

    Training makes the dot product of two word vectors fit the log of their co-occurrence count, which gives the weighted least-squares loss

    $J = \sum_{i,j=1}^{V} f(X_{ij}) \left( w_i^\top \tilde{w}_j + b_i + \tilde{b}_j - \log X_{ij} \right)^2, \quad f(x) = \begin{cases} (x / x_{max})^\alpha & x < x_{max} \\ 1 & \text{otherwise} \end{cases}$
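
    As a sanity check on the weighting term, f can be written in a few lines of numpy ($x_{max} = 100$ and $\alpha = 0.75$ are the values used in the paper): rare pairs are down-weighted and frequent pairs saturate at weight 1.

    import numpy as np

    def f(x, x_max=100, alpha=0.75):
        # (x / x_max)^alpha, capped at 1
        return np.minimum((x / x_max) ** alpha, 1.0)

    print(f(np.array([1, 10, 100, 1000])))  # [0.0316 0.1778 1. 1.] (rounded)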

    • Code

    Data preprocessing

    import numpy as np

    min_count = 50     # drop words occurring fewer than min_count times (value assumed)
    window_size = 2    # context window radius (value assumed)

    data = open("./data/text8.txt").read()
    data = data.split()

    # Count word frequencies, then build word2id over the words
    # that survive the min_count threshold
    word2freq = {}
    for word in data:
        if word2freq.get(word) is not None:
            word2freq[word] += 1
        else:
            word2freq[word] = 1

    word2id = {}
    for word in word2freq:
        if word2freq[word] < min_count:
            continue
        if word2id.get(word) is None:
            word2id[word] = len(word2id)
    vocab_size = len(word2id)

    # Accumulate the co-occurrence matrix over a symmetric window
    comat = np.zeros((vocab_size, vocab_size))
    for i in range(len(data)):
        if word2id.get(data[i]) is None:  # skip filtered-out words
            continue
        w_index = word2id[data[i]]
        for j in range(max(0, i - window_size), min(len(data), i + window_size + 1)):
            if word2id.get(data[j]) is None or i == j:
                continue
            u_index = word2id[data[j]]
            comat[w_index][u_index] += 1

    # np.nonzero(comat) returns two arrays (row indices, column indices);
    # transposing yields one (i, j) pair per nonzero entry
    coocs = np.transpose(np.nonzero(comat))
    labels = []
    for i in range(len(coocs)):
        labels.append(comat[coocs[i][0]][coocs[i][1]])
    labels = np.array(labels)
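
    To make the outputs concrete, here are the same steps run on a toy corpus (the sentence and the choice of min_count = 1, window = 1 are made up for illustration):

    toy = "the cat sat on the mat".split()
    word2id_toy = {w: i for i, w in enumerate(dict.fromkeys(toy))}  # keep every word
    V, window = len(word2id_toy), 1
    comat_toy = np.zeros((V, V))
    for i, w in enumerate(toy):
        for j in range(max(0, i - window), min(len(toy), i + window + 1)):
            if i != j:
                comat_toy[word2id_toy[w]][word2id_toy[toy[j]]] += 1
    coocs_toy = np.transpose(np.nonzero(comat_toy))
    labels_toy = comat_toy[coocs_toy[:, 0], coocs_toy[:, 1]]
    print(coocs_toy.shape, labels_toy)  # (10, 2) index pairs, each with count 1.0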

    Model definition

    import torch
    import torch.nn as nn

    class glove_model(nn.Module):
        def __init__(self, vocab_size, embed_size, x_max, alpha):
            super(glove_model, self).__init__()
            self.vocab_size = vocab_size
            self.embed_size = embed_size
            self.x_max = x_max
            self.alpha = alpha
            # center-word vectors and biases
            self.w_embed = nn.Embedding(self.vocab_size, self.embed_size).type(torch.float64)
            self.w_bias = nn.Embedding(self.vocab_size, 1).type(torch.float64)
            # context-word vectors and biases
            self.v_embed = nn.Embedding(self.vocab_size, self.embed_size).type(torch.float64)
            self.v_bias = nn.Embedding(self.vocab_size, 1).type(torch.float64)

        def forward(self, w_data, v_data, labels):
            w_data_embed = self.w_embed(w_data)            # bs * embed_size
            w_data_bias = self.w_bias(w_data).squeeze(-1)  # bs (squeeze the trailing 1 so it adds elementwise instead of broadcasting to bs*bs)
            v_data_embed = self.v_embed(v_data)            # bs * embed_size
            v_data_bias = self.v_bias(v_data).squeeze(-1)  # bs
            # weighting f(X_ij) = min((X_ij / x_max)^alpha, 1)
            weights = torch.pow(labels / self.x_max, self.alpha)
            weights[weights > 1] = 1
            # J = mean of f(X_ij) * (w_i . v_j + b_i + b_j - log X_ij)^2
            loss = torch.mean(weights * torch.pow(
                torch.sum(w_data_embed * v_data_embed, 1)
                + w_data_bias + v_data_bias - torch.log(labels), 2))
            return loss

        def save_embedding(self, word2id, file_name):
            # average center and context vectors to get the final embeddings
            embedding_1 = self.w_embed.weight.data.cpu().numpy()
            embedding_2 = self.v_embed.weight.data.cpu().numpy()
            embedding = (embedding_1 + embedding_2) / 2
            fout = open(file_name, 'w')
            fout.write('%d %d\n' % (len(word2id), self.embed_size))
            for w, wid in word2id.items():
                e = embedding[wid]
                e = ' '.join(map(lambda x: str(x), e))
                fout.write('%s %s\n' % (w, e))
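
    A quick smoke test of the module on dummy data (all sizes here are arbitrary):

    model = glove_model(vocab_size=1000, embed_size=50, x_max=100, alpha=0.75)
    w = torch.randint(0, 1000, (8,))                       # 8 center-word ids
    v = torch.randint(0, 1000, (8,))                       # 8 context-word ids
    counts = torch.randint(1, 50, (8,)).to(torch.float64)  # fake co-occurrence counts (>= 1)
    print(model(w, v, counts))                             # a single scalar loss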

    Training
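
    The loop below consumes batches of (center-word, context-word) index pairs together with their co-occurrence counts. The notes never show how training_iter, model, optimizer, and config are built; the following is a minimal sketch of those pieces (GloveDataset, Config, the batch size, and the choice of Adam are all assumptions, not from the original):

    from tqdm import tqdm
    import torch.utils.data as tud

    class GloveDataset(tud.Dataset):  # hypothetical helper
        def __init__(self, coocs, labels):
            self.coocs = coocs        # (n, 2) nonzero (center, context) index pairs
            self.labels = labels      # the n corresponding co-occurrence counts
        def __len__(self):
            return len(self.coocs)
        def __getitem__(self, idx):
            return self.coocs[idx], self.labels[idx]

    class Config:                     # stand-in for the notes' config object (assumed)
        epoch = 10
        cuda = True
    config = Config()

    training_iter = tud.DataLoader(GloveDataset(coocs, labels),
                                   batch_size=32, shuffle=True)
    model = glove_model(vocab_size, embed_size=100, x_max=100, alpha=0.75)
    if config.cuda and torch.cuda.is_available():
        model = model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)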

    loss = -1  # running smoothed loss; -1 marks "not yet initialized"
    for epoch in range(config.epoch):
        process_bar = tqdm(training_iter)  # progress bar over training batches
        for data, label in process_bar:
            w_data = torch.Tensor(np.array([sample[0] for sample in data])).long()  # center-word ids
            v_data = torch.Tensor(np.array([sample[1] for sample in data])).long()  # context-word ids
            if config.cuda and torch.cuda.is_available():
                w_data = w_data.cuda()
                v_data = v_data.cuda()
                label = label.cuda()
            loss_now = model(w_data, v_data, label)
            if loss == -1:
                loss = loss_now.data.item()
            else:
                loss = 0.95 * loss + 0.05 * loss_now.data.item()  # exponentially smooth the reported loss
            process_bar.set_postfix(loss=loss)  # display the smoothed loss
            optimizer.zero_grad()  # gradient step
            loss_now.backward()
            optimizer.step()
    model.save_embedding(word2id, "./embedding.txt")  # output path assumed

    Testing
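
    The notes leave this section empty. A common qualitative check is to reload the saved embedding file and inspect nearest neighbors under cosine similarity; a minimal sketch, assuming the output path used above:

    import numpy as np

    def load_embeddings(path):
        # parse the word2vec-style text format written by save_embedding
        with open(path) as fin:
            n, dim = map(int, fin.readline().split())
            words, vecs = [], []
            for line in fin:
                parts = line.rstrip().split(' ')
                words.append(parts[0])
                vecs.append(np.array(parts[1:], dtype=np.float64))
        return words, np.stack(vecs)

    words, vecs = load_embeddings("./embedding.txt")
    vecs = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)  # unit-normalize rows

    def nearest(word, k=5):
        sims = vecs @ vecs[words.index(word)]                   # cosine similarity to every word
        return [words[i] for i in np.argsort(-sims)[1:k + 1]]   # skip the word itself

    print(nearest("king"))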

    • Summary

    GloVe combines the global statistical information exploited by matrix-factorization methods such as Latent Semantic Analysis (LSA) with the strengths of local-context-window methods. Folding the global co-occurrence statistics in as a prior both speeds up training and, through the weighting function f, controls the relative weight given to each word pair.