Competition

https://www.kaggle.com/c/nlp-getting-started/

Upgrade plan

The original model implementation was messy, and no proper held-out split was carved from the training set.
The network was quite simple and overfitting was severe.

Upgrades:

  1. Add an extra linear layer
  2. Add an extra Dropout layer
  3. Add activation functions after several layers (items 1 to 3 are sketched after this list)
  4. Adjust the code that keeps the best training result
  5. Adjust the train/eval split
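
As a sketch of items 1 to 3 (layer names match the model printout in the next section; sizes assume str_len=25 and hidden_dim=256 as used below; this is an illustration rather than the training code itself):

import torch
import torch.nn as nn

# Upgraded classification head only; the Embedding + LSTM front end is omitted.
head = nn.Sequential(
    nn.Dropout(0.5),            # dropout1
    nn.Linear(25 * 256, 256),   # dense: flattened LSTM output -> hidden_dim
    nn.PReLU(),                 # op_prelu
    nn.Dropout(0.5),            # dropout2
    nn.Linear(256, 2),          # hidden2tag
    nn.Tanh(),                  # op_tanh
)

x = torch.randn(16, 25 * 256)   # a batch of 16 flattened LSTM outputs
print(head(x).shape)            # torch.Size([16, 2])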

Model

datalabel len: 6000
evallabel len: 1613
datalist len: 6000
evallist len: 1613
datalen: 25
using GPU

embedding_dim=300
hidden_dim=256
vocab_size=len(vocab)
target=2
Batchsize=16
stringlen=25
Epoch=20
lr=0.005

LSTMTagger(
  (word_embeddings): Embedding(45802, 300)
  (lstm): LSTM(300, 256)
  (dropout1): Dropout(p=0.5, inplace=False)
  (dense): Linear(in_features=6400, out_features=256, bias=True)
  (op_prelu): PReLU(num_parameters=1)
  (dropout2): Dropout(p=0.5, inplace=False)
  (hidden2tag): Linear(in_features=256, out_features=2, bias=True)
  (op_tanh): Tanh()
)
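
Note that the dense layer's in_features=6400 is the flattened LSTM output over the full sequence: str_len × hidden_dim = 25 × 256 = 6400.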

Code

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
USE_CUDA = torch.cuda.is_available()

base_path = ""
# train.csv columns: id, keyword, location, text, target
read_train = pd.read_csv(base_path + 'train.csv')
train_data = read_train.iloc[:, [1, 2, 3]]   # keyword, location, text
train_label = list(read_train.target)        # 0/1 disaster labels
read_test = pd.read_csv(base_path + 'test.csv')
test_data = read_test.iloc[:, [1, 2, 3]]
# Concatenate every keyword/location/text field from both the train and test
# sets, then take the set of whitespace-separated tokens as the vocabulary.
sentence = ""
for i in range(0, len(train_data)):
    sentence += str(train_data.iloc[i]['keyword']) + " "
    sentence += str(train_data.iloc[i]['location']) + " "
    sentence += str(train_data.iloc[i]['text']) + " "
for i in range(0, len(test_data)):
    sentence += str(test_data.iloc[i]['keyword']) + " "
    sentence += str(test_data.iloc[i]['location']) + " "
    sentence += str(test_data.iloc[i]['text']) + " "
vocab = set(sentence.split())
print(len(vocab))
# Map every vocabulary word to an integer index.
w2i = {word: index for index, word in enumerate(vocab)}
# Replace each field with its list of word indices.
train_data['keyword'] = [[w2i[i] for i in str(x).split()] for x in train_data.keyword]
train_data['location'] = [[w2i[i] for i in str(x).split()] for x in train_data.location]
train_data['text'] = [[w2i[i] for i in str(x).split()] for x in train_data.text]
test_data['keyword'] = [[w2i[i] for i in str(x).split()] for x in test_data.keyword]
test_data['location'] = [[w2i[i] for i in str(x).split()] for x in test_data.location]
test_data['text'] = [[w2i[i] for i in str(x).split()] for x in test_data.text]
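# For example, a tweet becomes a list of integer ids (the exact numbers depend
# on the set iteration order and are purely illustrative):
#   "Forest fire near La Ronge" -> [10532, 771, 2304, 118, 40991]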
# Truncate/pad every tweet to exactly max_len indices. Note that index 0 also
# belongs to a real vocabulary word, so padding is indistinguishable from it.
data_list = []
max_len = 25
for x in train_data.text:
    if len(x) >= max_len:
        x = x[0:max_len]
    while len(x) < max_len:
        x.append(0)
    data_list.append(x)
# Hold out everything after row 6000 (7613 rows total) as the eval split.
eval_list = data_list[6000:]
data_list = data_list[0:6000]
eval_label = train_label[6000:]
train_label = train_label[0:6000]
  69. print("datalabel len: ",len(data_list))
  70. print("evallabel len: ",len(eval_list))
  71. print("datalist len: ",len(data_list))
  72. print("evallist len: ",len(eval_list))
  73. print("datalen: ",len(data_list[0]))
  74. train_label=torch.tensor(train_label)
  75. eval_label=torch.tensor(eval_label)
  76. traindata_tensor = torch.Tensor(data_list)
  77. eval_tensor= torch.Tensor(eval_list)
  78. traindata_tensor = torch.Tensor(data_list)
if USE_CUDA:
    print("using GPU")
    traindata_tensor = traindata_tensor.cuda()
    train_label = train_label.cuda()
    eval_tensor = eval_tensor.cuda()
    eval_label = eval_label.cuda()
'''
LSTM input format:
    input: (seq_len, batch, input_size)
    h0:    (num_layers * num_directions, batch, hidden_size)
    c0:    (num_layers * num_directions, batch, hidden_size)
LSTM output format:
    output: (seq_len, batch, hidden_size * num_directions)
    hn:     (num_layers * num_directions, batch, hidden_size)
    cn:     (num_layers * num_directions, batch, hidden_size)
'''
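# Sanity check of the shapes documented above with this model's sizes
# (embedding_dim=300, hidden_dim=256, seq_len=25, batch=16); illustrative only.
_lstm_check = nn.LSTM(300, 256)
_out, (_hn, _cn) = _lstm_check(torch.randn(25, 16, 300))
print(_out.shape, _hn.shape, _cn.shape)
# torch.Size([25, 16, 256]) torch.Size([1, 16, 256]) torch.Size([1, 16, 256])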
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, batch_size, str_len):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.str_len = str_len
        self.batch_size = batch_size
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.dropout1 = nn.Dropout(0.5)
        # The LSTM output is flattened over the whole sequence, so the dense
        # layer sees str_len * hidden_dim input features.
        self.dense = nn.Linear(str_len * hidden_dim, hidden_dim)
        self.op_prelu = nn.PReLU()
        self.dropout2 = nn.Dropout(0.5)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.op_tanh = nn.Tanh()
        self.hidden = self.init_hidden()

    def init_hidden(self):
        h0 = torch.zeros(1, self.batch_size, self.hidden_dim)
        c0 = torch.zeros(1, self.batch_size, self.hidden_dim)
        if USE_CUDA:
            return (h0.cuda(), c0.cuda())
        return (h0, c0)

    def forward(self, sentence, state, train_flag):
        embeds = self.word_embeddings(sentence)
        self.hidden = state
        # Caution: view() reshapes the (batch, seq, emb) tensor rather than
        # transposing it, so batch and time positions are mixed here;
        # embeds.permute(1, 0, 2) would give a true (seq_len, batch, input) layout.
        lstm_out, self.hidden = self.lstm(embeds.view(self.str_len, len(sentence), -1), self.hidden)
        # Dropout is applied manually, only when train_flag is True.
        if train_flag:
            lstm_out = self.dropout1(lstm_out)
        tag_space = self.dense(lstm_out.view(self.batch_size, -1))
        tag_space = self.op_prelu(tag_space)
        if train_flag:
            tag_space = self.dropout2(tag_space)
        tag_space = self.hidden2tag(tag_space)
        tag_space = self.op_tanh(tag_space)
        return tag_space, self.hidden
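# Shape walk-through for one training batch of this model (batch_size=16,
# str_len=25, embedding_dim=300, hidden_dim=256):
#   sentence (16, 25) -> embeds (16, 25, 300) -> view (25, 16, 300)
#   -> lstm_out (25, 16, 256) -> flatten (16, 6400) -> dense (16, 256)
#   -> hidden2tag (16, 2)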
def out_put(net, batchsize):
    # Pad/truncate the test tweets the same way as the training data.
    test_list = []
    max_len = 25
    for x in test_data.text:
        if len(x) >= max_len:
            x = x[0:max_len]
        while len(x) < max_len:
            x.append(0)
        test_list.append(x)
    test_data_tensor = torch.Tensor(test_list)
    if USE_CUDA:
        print("using GPU to out")
        test_data_tensor = test_data_tensor.cuda()
    result = []
    state = None
    with torch.no_grad():
        result_dataset = torch.utils.data.TensorDataset(test_data_tensor)
        result_dataloader = torch.utils.data.DataLoader(result_dataset, batch_size=batchsize, shuffle=False)
        for X in result_dataloader:
            X = X[0]
            if state is not None:
                if isinstance(state, tuple):  # LSTM state: (h, c)
                    state = (state[0].detach(), state[1].detach())
                else:
                    state = state.detach()
            if X.shape[0] != batchsize:
                break
            X = X.long()
            (output, state) = net(X, state, False)
            _, predicted = torch.max(output.data, 1)
            result.extend(predicted.cpu().numpy())
    print(len(result))
    # The final incomplete batch is skipped above, so pad the submission
    # out to the 3263 test rows with a default label of 0.
    while len(result) < 3263:
        result.append(0)
    df_output = pd.DataFrame()
    aux = pd.read_csv(base_path + 'test.csv')
    df_output['id'] = aux['id']
    df_output['target'] = result
    df_output[['id', 'target']].to_csv(base_path + 's1mple.csv', index=False)
    print("reset the result csv")
def evaluate(net, eval_data, eval_label, batch_size, pre):
    net.eval()
    # Debug print: confirm the weights actually changed since the last epoch.
    print("enter", net.state_dict()["hidden2tag.bias"])
    dataset = torch.utils.data.TensorDataset(eval_data, eval_label)
    eval_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=False)
    total = 0
    correct = 0
    state = None
    with torch.no_grad():
        for X, y in eval_iter:
            X = X.long()
            if X.size(0) != batch_size:
                break
            if state is not None:
                if isinstance(state, tuple):  # LSTM state: (h, c)
                    state = (state[0].detach(), state[1].detach())
                else:
                    state = state.detach()
            (output, state) = net(X, state, False)
            _, predicted = torch.max(output.data, 1)
            total += X.size(0)
            correct += predicted.data.eq(y.data).cpu().sum()
    s = (1.0 * correct.numpy()) / total
    print("right", correct, "total", total, "Test Acc:", s)
    # Only rewrite the submission when this epoch beats the best accuracy so
    # far, and return the best value as the threshold for later epochs.
    if s > pre:
        print("flush the result csv")
        out_put(net, batch_size)
        return s
    return pre
def train(net, train_data, train_label, eval_tensor, eval_label, num_epochs, learning_rate, batch_size):
    state = None
    loss_fct = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=learning_rate, weight_decay=0)
    dataset = torch.utils.data.TensorDataset(train_data, train_label)
    train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)
    pre = 0
    for epoch in range(num_epochs):
        correct = 0
        total = 0
        batch_idx = 0
        net.train()  # evaluate() switches to eval mode, so switch back each epoch
        for X, y in train_iter:
            batch_idx += 1
            X = X.long()
            if X.size(0) != batch_size:
                break
            # Detach the recurrent state so gradients do not flow across batches.
            if state is not None:
                if isinstance(state, tuple):  # LSTM state: (h, c)
                    state = (state[0].detach(), state[1].detach())
                else:
                    state = state.detach()
            optimizer.zero_grad()
            (output, state) = net(X, state, True)
            loss = loss_fct(output.view(-1, 2), y.view(-1))
            _, predicted = torch.max(output.data, 1)
            loss.backward()
            optimizer.step()
            total += X.size(0)
            correct += predicted.data.eq(y.data).cpu().sum()
            s = "Acc:%.3f" % ((1.0 * correct.numpy()) / total)
            if batch_idx % 50 == 0:
                print("epoch", epoch, "loss:", loss.item(), "right", correct, "total", total, "Train Acc:", s)
        torch.save(net.state_dict(), 'LSTMmodel.pth')
        print("before", net.state_dict()["hidden2tag.bias"])
        pre = evaluate(net, eval_tensor, eval_label, batch_size, pre)
# LSTMTagger(embedding_dim, hidden_dim, vocab_size, tagset_size, batch_size, str_len)
embedding_dim = 300
hidden_dim = 256
vocab_size = len(vocab)
target = 2
Batchsize = 16
stringlen = 25
Epoch = 20
lr = 0.005
model = LSTMTagger(embedding_dim, hidden_dim, vocab_size, target, Batchsize, stringlen)
if USE_CUDA:
    model = model.cuda()
print(model)
"""
for param in model.parameters():
    nn.init.normal_(param, mean=0, std=0.01)
"""
"""
for name, parameters in model.named_parameters():
    print(name, ':', parameters.size())
"""
train(model, traindata_tensor, train_label, eval_tensor, eval_label, Epoch, lr, Batchsize)
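
The split above is still a fixed slice at row 6000 (upgrade item 5). A shuffled, stratified split would avoid any ordering bias in train.csv; a minimal sketch with sklearn's train_test_split, replacing the four slicing lines (not the code actually used above):

from sklearn.model_selection import train_test_split

# Hypothetical alternative to the fixed slice: a reproducible 80/20 split of
# the padded index lists and labels, keeping the 0/1 class balance (stratify).
data_list, eval_list, train_label, eval_label = train_test_split(
    data_list, train_label, test_size=0.2, random_state=42, stratify=train_label)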

Results

Local training

Accuracy peaks around epoch 15.

Online score

It still can't match BERT, but it's a big improvement over the original model.