- Paper title
GloVe: Global Vectors for Word Representation
- Co-occurrence matrix
| ratio(i, j, k) = P(k\|i) / P(k\|j) | words j, k related | words j, k unrelated |
|---|---|---|
| words i, k related | close to 1 | large |
| words i, k unrelated | small | close to 1 |
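The table captures the paper's key observation: only the "large" and "small" cells of the ratio are informative about how i and j differ. Requiring the word vectors to reproduce these ratios leads to the weighted least-squares objective that the code below implements ($w$ are center-word vectors, $\tilde{w}$ context vectors, $b, \tilde{b}$ biases, $X_{ij}$ the co-occurrence counts):

$$
J=\sum_{i,j=1}^{V} f(X_{ij})\left(w_i^{\top}\tilde{w}_j + b_i + \tilde{b}_j - \log X_{ij}\right)^2,
\qquad
f(x)=\begin{cases}(x/x_{\max})^{\alpha} & x < x_{\max}\\ 1 & \text{otherwise}\end{cases}
$$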
- Code
Data preprocessing
```python
import numpy as np

min_count = 5    # example values; the original notes take these
window_size = 5  # hyperparameters from a config

data = open("./data/text8.txt").read()
data = data.split()

# Build word2id, dropping low-frequency words
word2freq = {}
for word in data:
    word2freq[word] = word2freq.get(word, 0) + 1

word2id = {}
for word in word2freq:
    if word2freq[word] < min_count:
        continue
    if word2id.get(word) is None:
        word2id[word] = len(word2id)
vocab_size = len(word2id)

# Accumulate co-occurrence counts within a symmetric window
comat = np.zeros((vocab_size, vocab_size))
for i in range(len(data)):
    if word2id.get(data[i]) is None:
        continue
    w_index = word2id[data[i]]
    for j in range(max(0, i - window_size), min(len(data), i + window_size + 1)):
        if word2id.get(data[j]) is None or i == j:
            continue
        u_index = word2id[data[j]]
        comat[w_index][u_index] += 1

# np.nonzero(comat) returns (row_indices, col_indices); transposing
# gives an [n, 2] array of non-zero (center, context) index pairs
coocs = np.transpose(np.nonzero(comat))
labels = np.array([comat[r][c] for r, c in coocs])  # co-occurrence counts X_ij
```
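The training loop below iterates over `training_iter`, which yields batches of (center, context) pairs together with their counts. The notes do not show how it is built; a minimal sketch, assuming a plain shuffled generator (`batch_size` is illustrative):

```python
import torch

def get_batches(coocs, labels, batch_size=512):
    # Shuffle the non-zero (center, context) pairs and yield
    # (pairs, counts) batches in the format forward() expects
    idx = np.random.permutation(len(coocs))
    for start in range(0, len(idx), batch_size):
        batch = idx[start:start + batch_size]
        yield coocs[batch], torch.from_numpy(labels[batch])

training_iter = list(get_batches(coocs, labels))
```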
Model definition
```python
import torch
import torch.nn as nn

class glove_model(nn.Module):
    def __init__(self, vocab_size, embed_size, x_max, alpha):
        super(glove_model, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.x_max = x_max
        self.alpha = alpha
        self.w_embed = nn.Embedding(self.vocab_size, self.embed_size).type(torch.float64)  # center word vectors
        self.w_bias = nn.Embedding(self.vocab_size, 1).type(torch.float64)  # center word bias
        self.v_embed = nn.Embedding(self.vocab_size, self.embed_size).type(torch.float64)  # context word vectors
        self.v_bias = nn.Embedding(self.vocab_size, 1).type(torch.float64)  # context word bias

    def forward(self, w_data, v_data, labels):
        w_data_embed = self.w_embed(w_data)  # bs * embed_size
        w_data_bias = self.w_bias(w_data).squeeze(1)  # bs (squeezed so it broadcasts with the dot product below)
        v_data_embed = self.v_embed(v_data)
        v_data_bias = self.v_bias(v_data).squeeze(1)  # bs
        weights = torch.pow(labels / self.x_max, self.alpha)  # weighting function f(X_ij)
        weights[weights > 1] = 1
        loss = torch.mean(weights * torch.pow(
            torch.sum(w_data_embed * v_data_embed, 1) + w_data_bias + v_data_bias - torch.log(labels), 2))  # GloVe loss
        return loss

    def save_embedding(self, word2id, file_name):
        embedding_1 = self.w_embed.weight.data.cpu().numpy()
        embedding_2 = self.v_embed.weight.data.cpu().numpy()
        embedding = (embedding_1 + embedding_2) / 2  # combine center and context vectors
        fout = open(file_name, 'w')
        fout.write('%d %d\n' % (len(word2id), self.embed_size))
        for w, wid in word2id.items():
            e = embedding[wid]
            e = ' '.join(map(str, e))
            fout.write('%s %s\n' % (w, e))
        fout.close()
```
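The training loop also assumes a `config` object, a model, and an optimizer already exist. A minimal sketch (x_max = 100, alpha = 0.75, and AdaGrad with lr = 0.05 follow the paper; the other values are illustrative):

```python
import torch.optim as optim
from types import SimpleNamespace

# Minimal stand-in for the config object the notes reference
config = SimpleNamespace(epoch=10, cuda=True)

model = glove_model(vocab_size, embed_size=100, x_max=100, alpha=0.75)
if config.cuda and torch.cuda.is_available():
    model = model.cuda()
optimizer = optim.Adagrad(model.parameters(), lr=0.05)  # the paper trains GloVe with AdaGrad
```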
Training
```python
from tqdm import tqdm

loss = -1
for epoch in range(config.epoch):
    process_bar = tqdm(training_iter)  # progress bar over training batches
    for data, label in process_bar:
        w_data = torch.Tensor(np.array([sample[0] for sample in data])).long()  # center word indices
        v_data = torch.Tensor(np.array([sample[1] for sample in data])).long()  # context word indices
        if config.cuda and torch.cuda.is_available():
            w_data = w_data.cuda()
            v_data = v_data.cuda()
            label = label.cuda()
        loss_now = model(w_data, v_data, label)
        if loss == -1:
            loss = loss_now.data.item()
        else:
            loss = 0.95 * loss + 0.05 * loss_now.data.item()  # exponentially smoothed loss
        process_bar.set_postfix(loss=loss)  # show the smoothed loss
        optimizer.zero_grad()  # gradient update
        loss_now.backward()
        optimizer.step()
model.save_embedding(word2id, "glove_embedding.txt")  # file name is illustrative; save_embedding needs both arguments
```
Testing
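A simple way to test the trained vectors is to load the saved file and query nearest neighbors by cosine similarity. A minimal sketch, assuming the file written by `save_embedding` above:

```python
import numpy as np

def load_embedding(file_name):
    # Parse the "word v1 v2 ..." format written by save_embedding
    vectors = {}
    with open(file_name) as fin:
        fin.readline()  # skip the "vocab_size embed_size" header
        for line in fin:
            parts = line.rstrip().split(' ')
            vectors[parts[0]] = np.array(parts[1:], dtype=np.float64)
    return vectors

def nearest(vectors, query, topk=5):
    # Rank words by cosine similarity to the query word's vector
    q = vectors[query]
    sims = {w: np.dot(q, v) / (np.linalg.norm(q) * np.linalg.norm(v) + 1e-8)
            for w, v in vectors.items() if w != query}
    return sorted(sims, key=sims.get, reverse=True)[:topk]

vectors = load_embedding("glove_embedding.txt")
print(nearest(vectors, "king"))
```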
- Summary
GloVe combines the global co-occurrence statistics used by matrix-factorization methods such as Latent Semantic Analysis (LSA) with the strengths of local context window methods such as word2vec. Bringing in these global statistics as a prior both speeds up training and, through the weighting function f(X_ij), controls the relative weight given to frequent versus rare word pairs.
