• Papers

    Bag of Tricks for Efficient Text Classification
    Enriching Word Vectors with Subword Information

    • Advantages
      • Performs text classification quickly, with accuracy on par with many deep-learning classifiers.
      • Proposes training word vectors with subword (character n-gram) information, which alleviates the OOV problem to some extent (see the sketch below).
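
    A minimal sketch of the subword idea (not the papers' exact implementation; the function name char_ngrams and the n-gram lengths are illustrative): a word is wrapped in boundary markers and decomposed into character n-grams, so even an out-of-vocabulary word can be represented by summing the vectors of its n-grams.

    def char_ngrams(word, n_min=3, n_max=6):
        # wrap the word in boundary markers so prefixes/suffixes are distinguishable
        w = "<" + word + ">"
        ngrams = set()
        for n in range(n_min, n_max + 1):
            for i in range(len(w) - n + 1):
                ngrams.add(w[i:i + n])
        ngrams.add(w)  # the whole word is also kept as one feature
        return ngrams

    print(char_ngrams("where", n_min=3, n_max=3))
    # e.g. {'<wh', 'whe', 'her', 'ere', 're>', '<where>'} (set order may vary)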
    • Model

    (figure: fastText model architecture)
    Similarities with CBOW

    • Both are log-linear models.
    • Both average the input word vectors and then make a prediction.
    • The overall model structure is the same.

    Differences from CBOW

    • fastText extracts sentence-level features, whereas CBOW extracts context (window) features.
    • fastText is supervised (trained on class labels), whereas CBOW is unsupervised.

    Problems and solutions

    • With many output classes, the full softmax is slow, so a hierarchical softmax is used instead (a rough sketch follows this list).
    • The bag-of-words representation discards word order, so word n-grams are added as extra features to recover some of it.
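
    The original fastText uses a Huffman-tree hierarchical softmax. As a rough stand-in, the sketch below uses PyTorch's built-in nn.AdaptiveLogSoftmaxWithLoss, an adaptive (not hierarchical) softmax that likewise avoids scoring every class for every example; the class count and cutoffs here are made-up numbers for illustration only.

    import torch
    import torch.nn as nn

    n_classes = 100000                        # a large label set where the full softmax is costly
    hidden = torch.randn(8, 128)              # a batch of 8 pooled sentence vectors
    targets = torch.randint(0, n_classes, (8,))

    # frequent labels go into a small "head", rare labels into cheaper low-rank "tails"
    approx_softmax = nn.AdaptiveLogSoftmaxWithLoss(
        in_features=128, n_classes=n_classes, cutoffs=[1000, 10000])
    output, loss = approx_softmax(hidden, targets)
    print(loss)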
    • Code

    Data preprocessing

    from torch.utils import data
    import csv
    import nltk
    import torch
    import torch.nn as nn

    f = open("./data/AG/train.csv")
    rows = list(csv.reader(f, delimiter=',', quotechar='"'))
    f.close()

    n_gram = 2
    lowercase = True
    label = []
    datas = []
    for row in rows:
        label.append(int(row[0]) - 1)      # class labels in the csv are 1-based
        txt = " ".join(row[1:])
        if lowercase:
            txt = txt.lower()
        txt = nltk.word_tokenize(txt)      # split the sentence into word tokens
        new_txt = []
        for i in range(0, len(txt)):
            for j in range(n_gram):        # add n-gram tokens (here uni- and bi-grams)
                if j <= i:
                    new_txt.append(" ".join(txt[i-j:i+1]))
        datas.append(new_txt)

    min_count = 3
    word_freq = {}
    for doc in datas:                      # count frequencies first, to filter rare tokens later
        for word in doc:
            if word in word_freq:
                word_freq[word] += 1
            else:
                word_freq[word] = 1

    word2id = {"<pad>": 0, "<unk>": 1}
    for word in word_freq:                 # uni-grams first: they get real ids and need no hashing
        if word_freq[word] < min_count or " " in word:
            continue
        word2id[word] = len(word2id)
    uniwords_num = len(word2id)
    for word in word_freq:                 # 2-grams and longer: their ids will be hashed
        if word_freq[word] < min_count or " " not in word:
            continue
        word2id[word] = len(word2id)

    max_length = 100
    for i, doc in enumerate(datas):
        for j, word in enumerate(doc):
            if " " not in word:
                datas[i][j] = word2id.get(word, 1)
            else:                          # hashing trick: fold n-gram ids into 100000 buckets above the uni-gram ids
                datas[i][j] = word2id.get(word, 1) % 100000 + uniwords_num
        datas[i] = datas[i][0:max_length] + [0] * (max_length - len(datas[i]))
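
    The preprocessing leaves datas as padded id sequences and label as 0-based class indices. A minimal, assumed way to batch them with torch.utils.data (the class name AGDataset and the batch size are illustrative, not from the original code):

    class AGDataset(data.Dataset):
        def __init__(self, datas, labels):
            self.datas = datas
            self.labels = labels
        def __len__(self):
            return len(self.datas)
        def __getitem__(self, idx):
            # one item: a padded sequence of word/n-gram ids and its class index
            x = torch.tensor(self.datas[idx], dtype=torch.long)
            y = torch.tensor(self.labels[idx], dtype=torch.long)
            return x, y

    train_loader = data.DataLoader(AGDataset(datas, label), batch_size=64, shuffle=True)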

    Model
    Note on torch.nn.AvgPool1d (used below to average the word vectors over the sequence):

    # input shape:  [N, C, L_in]
    # output shape: [N, C, L_out]
    m = nn.AvgPool1d(3, stride=2)  # pool with a window of size 3 and stride 2
    m(torch.tensor([[[1., 2, 3, 4, 5, 6, 7]]]))
    # tensor([[[ 2., 4., 6.]]])


    class Fasttext(nn.Module):
        def __init__(self, vocab_size, embedding_size, max_length, label_num):
            super(Fasttext,self).__init__()
            self.embedding = nn.Embedding(vocab_size, embedding_size)  # embedding layer for words and hashed n-grams
            self.avg_pool = nn.AvgPool1d(kernel_size=max_length, stride=1) # average pooling over the sequence
            self.fc = nn.Linear(embedding_size, label_num) # fully connected output layer
        def forward(self, x):
            x = x.long()
            out = self.embedding(x) # batch_size*length*embedding_size
            out = out.transpose(1, 2).contiguous() # batch_size*embedding_size*length
            out = self.avg_pool(out).squeeze() # batch_size*embedding_size
            out = self.fc(out) # batch_size*label_num
            return out
    
    from torchsummary import summary
    fasttext = Fasttext(vocab_size=1000,embedding_size=10,max_length=100,label_num=4)
    summary(fasttext, input_size=(100,))
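
    A minimal training-loop sketch under the assumptions above (train_loader comes from the preprocessing section; the learning rate and epoch count are illustrative). Note that the hashed n-gram ids run up to uniwords_num + 99999, so the embedding table needs at least uniwords_num + 100000 rows:

    import torch.optim as optim

    model = Fasttext(vocab_size=uniwords_num + 100000, embedding_size=10,
                     max_length=100, label_num=4)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(5):
        for x, y in train_loader:
            optimizer.zero_grad()
            logits = model(x)            # batch_size * label_num
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()
        print("epoch %d, loss %.4f" % (epoch, loss.item()))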