- Paper title
GloVe: Global Vectors for Word Representation
- Co-occurrence matrix
The table below summarizes how the co-occurrence probability ratio behaves for a probe word k, where ratio(i,j,k) = P(k|i) / P(k|j) and P(k|i) = X_ik / X_i is estimated from co-occurrence counts:

| ratio(i,j,k) | words j, k related | words j, k unrelated |
|---|---|---|
| words i, k related | close to 1 | large |
| words i, k unrelated | small | close to 1 |
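From this ratio intuition the paper derives a weighted least-squares objective over the nonzero co-occurrence counts, which is exactly what the loss in the model code below computes:

$$
J = \sum_{i,j=1}^{V} f(X_{ij}) \left( w_i^\top \tilde{w}_j + b_i + \tilde{b}_j - \log X_{ij} \right)^2,
\qquad
f(x) = \begin{cases} (x/x_{\max})^\alpha & x < x_{\max} \\ 1 & \text{otherwise} \end{cases}
$$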
- Code
Data preprocessing

```python
import numpy as np

min_count = 5    # drop words rarer than this (illustrative hyperparameter)
window_size = 2  # context window radius (illustrative hyperparameter)

data = open("./data/text8.txt").read()
data = data.split()

# Build word2id, dropping low-frequency words
word2freq = {}
for word in data:
    word2freq[word] = word2freq.get(word, 0) + 1

word2id = {}
for word in word2freq:
    if word2freq[word] < min_count:
        continue
    if word not in word2id:
        word2id[word] = len(word2id)
vocab_size = len(word2id)

# Accumulate co-occurrence counts within a symmetric window
comat = np.zeros((vocab_size, vocab_size))
for i in range(len(data)):
    if data[i] not in word2id:
        continue
    w_index = word2id[data[i]]
    for j in range(max(0, i - window_size), min(len(data), i + window_size + 1)):
        if data[j] not in word2id or i == j:
            continue
        u_index = word2id[data[j]]
        comat[w_index][u_index] += 1

# np.nonzero(comat) returns (row_indices, col_indices); transposing
# gives an [n, 2] array of (center, context) index pairs
coocs = np.transpose(np.nonzero(comat))
labels = []
for i in range(len(coocs)):
    labels.append(comat[coocs[i][0]][coocs[i][1]])
labels = np.array(labels)
```
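The training loop further down expects a `training_iter` that yields `(data, label)` batches, where each element of `data` is a `(center, context)` index pair and `label` holds the corresponding co-occurrence counts. A minimal sketch of one way to build it with PyTorch's `DataLoader`; the names `batch_size`, `pairs`, and `collate` are illustrative and not from the original notes:

```python
import torch
from torch.utils.data import DataLoader

batch_size = 512  # illustrative value

# Pair up the nonzero co-occurrence indices with their counts
pairs = [((int(w), int(u)), float(c)) for (w, u), c in zip(coocs, labels)]

def collate(batch):
    data = [sample[0] for sample in batch]  # list of (center, context) index pairs
    label = torch.tensor([sample[1] for sample in batch], dtype=torch.float64)
    return data, label

training_iter = DataLoader(pairs, batch_size=batch_size, shuffle=True, collate_fn=collate)
```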
Model definition

```python
import torch
import torch.nn as nn

class glove_model(nn.Module):
    def __init__(self, vocab_size, embed_size, x_max, alpha):
        super(glove_model, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.x_max = x_max
        self.alpha = alpha
        self.w_embed = nn.Embedding(self.vocab_size, self.embed_size).type(torch.float64)  # center word vectors
        self.w_bias = nn.Embedding(self.vocab_size, 1).type(torch.float64)  # center word biases
        self.v_embed = nn.Embedding(self.vocab_size, self.embed_size).type(torch.float64)  # context word vectors
        self.v_bias = nn.Embedding(self.vocab_size, 1).type(torch.float64)  # context word biases

    def forward(self, w_data, v_data, labels):
        w_data_embed = self.w_embed(w_data)  # bs * embed_size
        w_data_bias = self.w_bias(w_data).squeeze(1)  # bs (squeeze so the sum below broadcasts correctly)
        v_data_embed = self.v_embed(v_data)
        v_data_bias = self.v_bias(v_data).squeeze(1)
        # weighting function f(X_ij) = min((X_ij / x_max)^alpha, 1)
        weights = torch.pow(labels / self.x_max, self.alpha)
        weights[weights > 1] = 1
        # weighted least-squares loss
        loss = torch.mean(weights * torch.pow(
            torch.sum(w_data_embed * v_data_embed, 1)
            + w_data_bias + v_data_bias - torch.log(labels), 2))
        return loss

    def save_embedding(self, word2id, file_name):
        # save the average of center and context vectors in word2vec text format
        embedding_1 = self.w_embed.weight.data.cpu().numpy()
        embedding_2 = self.v_embed.weight.data.cpu().numpy()
        embedding = (embedding_1 + embedding_2) / 2
        fout = open(file_name, 'w')
        fout.write('%d %d\n' % (len(word2id), self.embed_size))
        for w, wid in word2id.items():
            e = embedding[wid]
            e = ' '.join(map(lambda x: str(x), e))
            fout.write('%s %s\n' % (w, e))
        fout.close()
```
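To wire everything together before training, a minimal setup sketch; the `SimpleNamespace` standing in for `config` and its values are illustrative, while AdaGrad with lr = 0.05, x_max = 100, and alpha = 0.75 follow the original paper:

```python
from types import SimpleNamespace
import torch
import torch.optim as optim

config = SimpleNamespace(epoch=10, cuda=False)  # illustrative config object

model = glove_model(vocab_size, embed_size=100, x_max=100, alpha=0.75)
if config.cuda and torch.cuda.is_available():
    model = model.cuda()
optimizer = optim.Adagrad(model.parameters(), lr=0.05)
```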
Training

```python
from tqdm import tqdm

loss = -1
for epoch in range(config.epoch):
    process_bar = tqdm(training_iter)  # progress bar over batches
    for data, label in process_bar:
        w_data = torch.Tensor(np.array([sample[0] for sample in data])).long()
        v_data = torch.Tensor(np.array([sample[1] for sample in data])).long()
        if config.cuda and torch.cuda.is_available():
            w_data = w_data.cuda()
            v_data = v_data.cuda()
            label = label.cuda()
        loss_now = model(w_data, v_data, label)
        if loss == -1:
            loss = loss_now.item()
        else:
            loss = 0.95 * loss + 0.05 * loss_now.item()  # exponentially smoothed loss for display
        process_bar.set_postfix(loss=loss)  # show smoothed loss on the bar
        optimizer.zero_grad()  # gradient update
        loss_now.backward()
        optimizer.step()
model.save_embedding(word2id, "./glove_embedding.txt")  # output path is illustrative
```
Test
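The notes leave this section empty; as a minimal sketch, one common sanity check is to load the saved embeddings and list nearest neighbors by cosine similarity (the file path matches the illustrative one used above):

```python
import numpy as np

def load_embedding(file_name):
    word2vec = {}
    with open(file_name) as fin:
        fin.readline()  # skip the "vocab_size embed_size" header line
        for line in fin:
            parts = line.rstrip().split(' ')
            word2vec[parts[0]] = np.array(parts[1:], dtype=np.float64)
    return word2vec

def nearest(word, word2vec, topk=5):
    v = word2vec[word]
    sims = {w: np.dot(v, u) / (np.linalg.norm(v) * np.linalg.norm(u))
            for w, u in word2vec.items() if w != word}
    return sorted(sims, key=sims.get, reverse=True)[:topk]

word2vec = load_embedding("./glove_embedding.txt")
print(nearest("king", word2vec))  # expect semantically related words after training
```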
- Summary
GloVe combines the global co-occurrence statistics exploited by matrix-factorization methods such as Latent Semantic Analysis (LSA) with the strengths of local context window methods such as word2vec. Incorporating these global statistics as a prior speeds up training, and the weighting function controls the relative importance of frequent versus rare co-occurrences.