### nn.Embedding in PyTorch
```python
class torch.nn.Embedding(num_embeddings, embedding_dim, padding_idx=None, max_norm=None, norm_type=2, scale_grad_by_freq=False, sparse=False)
```
Parameters
- num_embeddings (int) - size of the dictionary of embeddings
- embedding_dim (int) - the size of each embedding vector
- padding_idx (int, optional) - if given, the output is the zero vector whenever this index is encountered, and the corresponding row is not updated during training
- max_norm (float, optional) - if given, each embedding vector whose norm exceeds this value is renormalized so that its norm does not exceed it
- norm_type (float, optional) - the p of the p-norm used for the max_norm option
- scale_grad_by_freq (boolean, optional) - if given, gradients are scaled by the inverse frequency of the words in the mini-batch
- sparse (bool, optional) - if True, the gradient with respect to the weight matrix will be a sparse tensor
Variables
- weight (Tensor) - the learnable weights of the module, of shape (num_embeddings, embedding_dim)
Shape
- Input: LongTensor of shape (N, W), where N = mini-batch size and W = number of indices to extract per mini-batch
- Output: (N, W, embedding_dim)
Example
```python
import torch
import torch.nn as nn

embedding = nn.Embedding(10, 3)
embedding.weight.data
# Out[4]:
# tensor([[ 0.3582,  0.8383,  1.5491],
#         [ 0.0932, -0.5001, -0.3267],
#         [-1.2723, -0.5441,  1.2399],
#         [-0.2568,  0.3505, -0.3154],
#         [-0.1027, -0.2083, -1.3749],
#         [-0.5723, -0.5140,  0.7822],
#         [-0.3028, -0.3044, -0.3639],
#         [ 0.4530,  1.2876, -0.5515],
#         [-1.0198,  0.5734, -1.1364],
#         [ 1.1018,  1.1473,  0.3840]])

# look up vectors by index
embedding(torch.tensor([[1, 2, 4], [5, 6, 4]]))
# Out[9]:
# tensor([[[ 0.0932, -0.5001, -0.3267],
#          [-1.2723, -0.5441,  1.2399],
#          [-0.1027, -0.2083, -1.3749]],
#
#         [[-0.5723, -0.5140,  0.7822],
#          [-0.3028, -0.3044, -0.3639],
#          [-0.1027, -0.2083, -1.3749]]], grad_fn=<EmbeddingBackward>)
```
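As a side note on padding_idx, here is a minimal sketch showing that the row reserved for padding is the zero vector and stays fixed during training (the indices and sizes below are chosen only for illustration):

```python
import torch
import torch.nn as nn

# index 0 is treated as padding: its row defaults to zeros and receives no gradient
embedding = nn.Embedding(10, 3, padding_idx=0)
out = embedding(torch.tensor([[0, 2, 5]]))
print(out[0, 0])   # tensor([0., 0., 0.], grad_fn=...) -- the padding row
```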
<a name="oBCdF"></a>
### Hands-on practice
<a name="BtIJy"></a>
#### Simple data cleaning
```python
import re
import jieba

if __name__ == '__main__':
    tran_file = open("train_txt.txt", "w", encoding="utf-8")
    with open("./data/mmi_text.txt", "r", encoding="utf-8") as rf:
        for line in rf.readlines():
            # strip punctuation
            new_line = re.sub("[??,,.。!!/、@#\$%\^&\*\(\)()《》<>\{\}+::【】]|\[.*?\]|[a-z A-Z]", "", line)
            # strip emoji
            new_line = re.sub('[(\U0001F600-\U0001F92F|\U0001F300-\U0001F5FF|\U0001F680-\U0001F6FF|\U0001F190-\U0001F1FF|\U00002702-\U000027B0|\U0001F926-\U0001FA9F|\u200d|\u2640-\u2642|\u2600-\u2B55|\u23cf|\u23e9|\u231a|\ufe0f)]+', "", new_line)
            new_line = re.sub("⃣", "", new_line)
            # strip digits, stray Chinese symbols, and whitespace runs
            new_line = re.sub("[「」·〗〖¥\-—~\丨~]|[0-9]|[①②③④⑤⑥⑦⑧⑨🅿▪]|[\r|\n ]+", "", new_line)
            if new_line.strip() == "":
                continue
            # word segmentation
            keyword_list = jieba.cut(new_line, cut_all=False)
            # join the tokens with spaces
            keyword_str = " ".join(keyword_list)
            tran_file.write(keyword_str + " ")
    tran_file.close()
```
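For a quick sanity check of the cleaning step, the snippet below (with a made-up input sentence) shows the kind of space-separated output that ends up in train_txt.txt:

```python
import jieba

sample = "今天天气真好我们一起去公园散步"   # hypothetical sentence after cleaning
print(" ".join(jieba.cut(sample, cut_all=False)))
# something like: 今天天气 真好 我们 一起 去 公园 散步 (exact segmentation depends on jieba's dictionary)
```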
#### Initialize the hyperparameters
```python
# number of context (background) words taken on each side of the center word
C = 3
# number of noise words sampled per positive word
K = 15
MAX_VOCAB_SIZE = 30000
EMBEDDING_SIZE = 200
batch_size = 32
lr = 0.02
```
C is, as in the paper, the number of words taken on each side of the center word as context. Training is approximated with negative sampling, and K = 15 means 15 noise words are drawn for each positive word. MAX_VOCAB_SIZE = 30000 means this experiment trains word vectors for 30,000 words, but in practice only the MAX_VOCAB_SIZE - 1 most frequent words in the corpus are kept; everything else is mapped to <UNK> (see the vocabulary-building code below).
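To make the window size concrete, here is a tiny sketch (with made-up tokens) of which positions count as context when C = 3:

```python
# with C = 3, the context of the word at position idx is the 3 tokens
# before it and the 3 tokens after it
tokens = ["t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7"]   # hypothetical token list
idx, C = 4, 3
context = tokens[idx - C:idx] + tokens[idx + 1:idx + C + 1]
print(tokens[idx], context)   # t4 ['t1', 't2', 't3', 't5', 't6', 't7']
```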
#### Build the vocabulary
with open("./train_txt.txt", 'r', encoding="utf-8") as rf:
text = rf.read().strip()
# 分割成单词列表
text = text.lower().split()
# 得到单词字典表,key是单词,value是次数
vocab_dict = dict(Counter(text).most_common(MAX_VOCAB_SIZE - 1))
# 把不常用的单词都编码为"<UNK>"
vocab_dict['<UNK>'] = len(text) - np.sum(list(vocab_dict.values()))
idx2word = [word for word in vocab_dict.keys()]
word2idx = {word:i for i, word in enumerate(idx2word)}
word_counts = np.array([count for count in vocab_dict.values()], dtype=np.float32)
word_freqs = word_counts / np.sum(word_counts)
word_freqs = word_freqs ** (3./4.)
In the last line, word_freqs holds the frequency of each word; every frequency is then raised to the 0.75 power, because the word2vec paper recommends this for the negative-sampling distribution. Leaving this adjustment out also works, it just deviates from what the paper suggests.
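A tiny numeric illustration (with made-up frequencies) of what the 0.75 power does: frequent words become relatively less likely, and rare words relatively more likely, to be drawn as noise words:

```python
import numpy as np

freqs = np.array([0.90, 0.09, 0.01])     # hypothetical unigram frequencies
adjusted = freqs ** 0.75
adjusted /= adjusted.sum()               # normalize just for comparison
print(adjusted)                          # roughly [0.83, 0.15, 0.03]
```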
#### Create the dataset
To use DataLoader, the dataset needs to define the following two functions:
- __len__(): returns the number of items in the whole dataset
- __getitem__(idx): returns one item for the given idx
```python
import torch
import torch.utils.data as tud

class WordEmbeddingDataset(tud.Dataset):
    def __init__(self, text, word2idx, idx2word, word_freqs, word_counts):
        '''
        text: a list of words, all text from the training dataset
        word2idx: the dictionary from word to index
        idx2word: index to word mapping
        word_freqs: the frequency of each word
        word_counts: the word counts
        '''
        super(WordEmbeddingDataset, self).__init__()  # initialize via the parent class, then override the two methods below
        # encode words as indices; words not in the vocabulary are also mapped to <UNK>
        self.text_encoded = [word2idx.get(word, word2idx['<UNK>']) for word in text]
        # nn.Embedding expects LongTensor indices
        self.text_encoded = torch.LongTensor(self.text_encoded)
        self.word2idx = word2idx
        self.idx2word = idx2word
        self.word_freqs = torch.Tensor(word_freqs)
        self.word_counts = torch.Tensor(word_counts)

    def __len__(self):
        # total number of words, i.e. the number of items
        return len(self.text_encoded)

    def __getitem__(self, idx):
        '''
        Returns the data used for training:
        - the center word
        - the positive words around it
        - K randomly sampled words per positive word as negative words
        '''
        # the center word
        center_words = self.text_encoded[idx]
        # indices of the C words on each side of the center word
        pos_indices = list(range(idx - C, idx)) + list(range(idx + 1, idx + C + 1))
        # take the indices modulo the text length to avoid going out of range
        pos_indices = [i % len(self.text_encoded) for i in pos_indices]
        pos_words = self.text_encoded[pos_indices]  # tensor(list)
        # torch.multinomial draws K * pos_words.shape[0] samples from self.word_freqs and returns the sampled indices
        # sampling is with replacement, and larger values in self.word_freqs are more likely to be drawn
        # for every positive word we sample K negative words; pos_words.shape[0] is the number of positive words
        neg_words = torch.multinomial(self.word_freqs, K * pos_words.shape[0], True)
        return center_words, pos_words, neg_words
```

#### Build the Embedding model
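The model below implements the skip-gram objective with negative sampling. For a center word with vector $v_c$, a context (positive) word with vector $u_o$, and $K$ sampled noise words with vectors $u_k$, the standard per-pair loss is

$$
L = -\log \sigma(u_o^\top v_c) - \sum_{k=1}^{K} \log \sigma(-u_k^\top v_c),
$$

which is what forward computes with logsigmoid and the negated dot products, summed over all context and noise words of a sample. (Note that in this implementation both the center and context vectors are looked up from in_embed; out_embed is defined but not used in forward.)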
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(EmbeddingModel, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size)
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size)

    def forward(self, input_labels, pos_labels, neg_labels):
        input_embedding = self.in_embed(input_labels)  # [batch_size, embed_size]
        pos_embedding = self.in_embed(pos_labels)      # [batch_size, (window * 2), embed_size]
        neg_embedding = self.in_embed(neg_labels)      # [batch_size, (window * 2 * K), embed_size]

        input_embedding = input_embedding.unsqueeze(2)  # [batch_size, embed_size, 1]

        pos_dot = torch.bmm(pos_embedding, input_embedding)   # [batch_size, (window * 2), 1]
        pos_dot = pos_dot.squeeze(2)                           # [batch_size, (window * 2)]

        neg_dot = torch.bmm(neg_embedding, -input_embedding)  # [batch_size, (window * 2 * K), 1]
        neg_dot = neg_dot.squeeze(2)                           # [batch_size, (window * 2 * K)]

        # .sum() would give a single number; .sum(1) gives a 1-D tensor with one value per sample
        log_pos = F.logsigmoid(pos_dot).sum(1)
        log_neg = F.logsigmoid(neg_dot).sum(1)

        loss = log_pos + log_neg
        return -loss

    def input_embedding(self):
        return self.in_embed.weight.cpu().detach().numpy()
```

#### Train the model
```python
dataset = WordEmbeddingDataset(text, word2idx, idx2word, word_freqs, word_counts)
dataloader = tud.DataLoader(dataset, batch_size, shuffle=True)

model = EmbeddingModel(MAX_VOCAB_SIZE, EMBEDDING_SIZE)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for e in range(10):
    for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
        input_labels = input_labels.long()
        pos_labels = pos_labels.long()
        neg_labels = neg_labels.long()

        optimizer.zero_grad()
        loss = model(input_labels, pos_labels, neg_labels).mean()
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print('epoch', e, 'iteration', i, loss.item())

embedding_weights = model.input_embedding()
torch.save(model.state_dict(), "embedding-{}.th".format(EMBEDDING_SIZE))
```
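After training, embedding_weights can be used to look up similar words. Below is a minimal sketch using cosine similarity; find_nearest is a helper written here only for illustration and is not part of the code above:

```python
import numpy as np

def find_nearest(word, embedding_weights, word2idx, idx2word, topn=10):
    # cosine similarity between the query word vector and every word vector
    vec = embedding_weights[word2idx[word]]
    norms = np.linalg.norm(embedding_weights, axis=1) * np.linalg.norm(vec)
    cos = embedding_weights @ vec / (norms + 1e-8)
    return [idx2word[i] for i in cos.argsort()[::-1][:topn]]

# e.g. find_nearest(some_word, embedding_weights, word2idx, idx2word)
# where some_word is any word that appears in the vocabulary
```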
