- 题目
Bag of Tricks for Efficient Text Classification
Enriching Word Vectors with Subword Information
- 优点
- 快速进行文本分类,效果与许多深度学习分类器平分秋色。
- 提出使用子词的词向量训练方法,一定程度上解决OOV问题。
- 模型
与CBOW的联系
- 对数线性模型。
- 对输入的词向量做平均然后进行预测。
- 模型结构一样。
与CBOW的区别
- fastText提取的是句子特征,CBOW提取的是上下文特征。
- fastText是监督学习,CBOW为无监督学习。
存在的问题与解决方法
- 类别很多的时候,softmax速度慢,采用层次softmax。
- 使用词袋模型,没有词序信息,采用n-gram。
- 代码
数据预处理
from torch.utils import data
import os
import csv
import nltk
import numpy as np
import torch
import torch.nn as nn
import numpy as np
# Load the training CSV and expand every sample into a flat list of n-gram
# tokens. Each row is read as: first column = 1-based class label, remaining
# columns = text fields that are joined into one string.
n_gram = 2        # highest n-gram order generated (1..n_gram)
lowercase = True  # lower-case the text before tokenizing
label = []        # 0-based class label per sample
datas = []        # n-gram token list per sample

# The context manager closes the file even if parsing fails; newline="" is
# what the csv module expects so quoted fields with line breaks parse correctly.
with open("./data/AG/train.csv", newline="") as f:
    rows = list(csv.reader(f, delimiter=',', quotechar='"'))

for row in rows:
    if not row:
        # csv.reader yields [] for blank lines; they carry no label or text.
        continue
    label.append(int(row[0]) - 1)  # shift labels from 1-based to 0-based
    txt = " ".join(row[1:])
    if lowercase:
        txt = txt.lower()
    txt = nltk.word_tokenize(txt)  # split the sentence into word tokens
    new_txt = []
    for i in range(len(txt)):
        # For position i emit the 1-gram, 2-gram, ... ending at i. The order
        # is capped at i + 1 so the window never reaches before the start.
        for j in range(min(n_gram, i + 1)):
            new_txt.append(" ".join(txt[i - j:i + 1]))
    datas.append(new_txt)
# Build the vocabulary from the n-gram token lists in `datas`, then replace
# every token by an integer id and pad/truncate each sample to `max_length`.
min_count = 3  # tokens seen fewer times than this are treated as unknown

# Count token frequencies first; they are used below to drop rare tokens.
# The loop variable is deliberately not called `data`, which would overwrite
# the `torch.utils.data` module imported at the top of the file.
word_freq = {}
for tokens in datas:
    for word in tokens:
        word_freq[word] = word_freq.get(word, 0) + 1

word2id = {"<pad>": 0, "<unk>": 1}

# Unigrams (tokens without a space) get their ids first; they are looked up
# directly and are not hashed.
for word, freq in word_freq.items():
    if freq >= min_count and " " not in word:
        word2id[word] = len(word2id)
uniwords_num = len(word2id)  # ids below this are <pad>, <unk> and unigrams

# Higher-order n-grams (tokens containing a space) are appended afterwards;
# their ids are folded into a fixed number of buckets when encoding.
for word, freq in word_freq.items():
    if freq >= min_count and " " in word:
        word2id[word] = len(word2id)

max_length = 100     # fixed number of ids kept per sample
bucket_num = 100000  # number of hash buckets shared by all n-grams

for i, tokens in enumerate(datas):
    ids = []
    for word in tokens:
        if " " not in word:
            ids.append(word2id.get(word, 1))  # 1 == <unk>
        else:
            # Hash trick: n-gram ids land in
            # [uniwords_num, uniwords_num + bucket_num). Unknown n-grams fall
            # back to raw id 1 and therefore share bucket uniwords_num + 1.
            ids.append(word2id.get(word, 1) % bucket_num + uniwords_num)
    # Truncate to max_length, then right-pad with 0 (<pad>) when shorter.
    datas[i] = ids[:max_length] + [0] * (max_length - len(ids))
模型
注:`torch.nn.AvgPool1d`
input_shape : [N, C, L_in]
output_shape : [N, C, L_out]
# Demo: 1-D average pooling with a window of 3 that advances 2 steps at a time.
m = nn.AvgPool1d(kernel_size=3, stride=2)
sample = torch.tensor([[[1., 2, 3, 4, 5, 6, 7]]])  # shape [1, 1, 7]
m(sample)
# tensor([[[ 2., 4., 6.]]])
class Fasttext(nn.Module):
    """fastText-style classifier: embed ids, average over the sequence, project.

    Args:
        vocab_size: Number of rows in the embedding table; every input id
            must be smaller than this.
        embedding_size: Dimension of each embedding vector.
        max_length: Sequence length of the inputs; used as the window size of
            the average pooling so the whole sequence collapses to one vector.
        label_num: Number of output classes.
    """

    def __init__(self, vocab_size, embedding_size, max_length, label_num):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)  # embedding layer
        self.avg_pool = nn.AvgPool1d(kernel_size=max_length, stride=1)  # averaging layer
        self.fc = nn.Linear(embedding_size, label_num)  # fully connected layer

    def forward(self, x):
        """Return class scores for a batch of id sequences.

        Args:
            x: Tensor of token ids with shape (batch_size, max_length). It is
                cast to ``long`` before the embedding lookup.

        Returns:
            Tensor of shape (batch_size, label_num) holding unnormalized scores.
        """
        x = x.long()
        out = self.embedding(x)  # batch_size * length * embedding_size
        out = out.transpose(1, 2).contiguous()  # batch_size * embedding_size * length
        # Drop only the pooled length axis. A bare squeeze() would also remove
        # the batch axis when batch_size == 1 and change the output rank.
        out = self.avg_pool(out).squeeze(-1)  # batch_size * embedding_size
        out = self.fc(out)  # batch_size * label_num
        return out
from torchsummary import summary
# Instantiate a small demo model and print its layer-by-layer summary.
demo_kwargs = {
    "vocab_size": 1000,
    "embedding_size": 10,
    "max_length": 100,
    "label_num": 4,
}
fasttext = Fasttext(**demo_kwargs)
# The summary input is one sequence of max_length ids (batch axis excluded).
summary(fasttext, input_size=(demo_kwargs["max_length"],))