The Competition

https://www.kaggle.com/c/nlp-getting-started
Put simply, this is a binary sentence-classification problem.

Basic Approach

  1. Build a vocabulary
  2. Encode each input sentence as a sequence of word indices
  3. Feed the index sequence into an embedding layer
  4. Feed the embedding output into an LSTM
  5. Feed the LSTM output into a Linear layer
  6. Compute the loss between the final output value and the 0/1 label
  7. Use torch.nn.MSELoss() as the loss function
  8. Optimize with optim.SGD(model.parameters(), lr=learning_rate)

Overall architecture: Embedding + LSTM + Linear
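
Before the full code, here is a minimal shape-annotated sketch of this pipeline. It is a sketch only: the sizes are illustrative and the reshaping is simplified relative to the code below.

```python
import torch
import torch.nn as nn

# Illustrative sizes, not the ones used in the full code below
vocab_size, embedding_dim, hidden_dim, seq_len, batch_size = 1000, 10, 100, 31, 16

embedding = nn.Embedding(vocab_size, embedding_dim)
lstm = nn.LSTM(embedding_dim, hidden_dim)       # expects (seq_len, batch, input_size)
linear = nn.Linear(seq_len * hidden_dim, 1)     # one score per sentence
loss_fn = nn.MSELoss()

tokens = torch.randint(0, vocab_size, (batch_size, seq_len))  # index-encoded sentences
labels = torch.rand(batch_size, 1).round()                    # dummy 0/1 targets

emb = embedding(tokens)                         # (batch, seq_len, embedding_dim)
out, _ = lstm(emb.transpose(0, 1))              # (seq_len, batch, hidden_dim)
score = linear(out.transpose(0, 1).reshape(batch_size, -1))   # (batch, 1)
loss = loss_fn(score, labels)                   # regress the score toward 0/1
loss.backward()
```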

Code

```python
import time

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

USE_CUDA = torch.cuda.is_available()

base_path = "D:\\New_desktop\\nlp-getting-started\\"
read_train = pd.read_csv(base_path + 'train.csv')
train_data = read_train.iloc[:, [1, 2, 3]].copy()   # keyword, location, text
train_label = torch.tensor(read_train.iloc[:, [4]].values, dtype=torch.float)
read_test = pd.read_csv(base_path + 'test.csv')
test_data = read_test.iloc[:, [1, 2, 3]].copy()

# Build the vocabulary from keyword + location + text of both train and test
sentence = ""
for i in range(0, len(train_data)):
    sentence += str(train_data.iloc[i]['keyword']) + " "
    sentence += str(train_data.iloc[i]['location']) + " "
    sentence += str(train_data.iloc[i]['text']) + " "
for i in range(0, len(test_data)):
    sentence += str(test_data.iloc[i]['keyword']) + " "
    sentence += str(test_data.iloc[i]['location']) + " "
    sentence += str(test_data.iloc[i]['text']) + " "
dict = set(sentence.split())   # note: shadows the built-in `dict`
print(len(dict))

# word -> index mapping
w2i = {}
def word2index():
    index = 0
    for i in dict:
        w2i[i] = index
        index += 1
word2index()

# Replace every column with lists of word indices
train_data['keyword'] = [[w2i[i] for i in str(x).split()] for x in train_data.keyword]
train_data['location'] = [[w2i[i] for i in str(x).split()] for x in train_data.location]
train_data['text'] = [[w2i[i] for i in str(x).split()] for x in train_data.text]
test_data['keyword'] = [[w2i[i] for i in str(x).split()] for x in test_data.keyword]
test_data['location'] = [[w2i[i] for i in str(x).split()] for x in test_data.location]
test_data['text'] = [[w2i[i] for i in str(x).split()] for x in test_data.text]

# Pad every training text with index 0 up to the maximum length
data_list = []
max_len = 0
for i in range(0, len(train_data)):
    if len(train_data.iloc[i]['text']) > max_len:
        max_len = len(train_data.iloc[i]['text'])
print(max_len)   # 31 for this data set
for x in train_data.text:
    while len(x) < max_len:
        x.append(0)
for i in range(0, len(train_data)):
    data_list.append(train_data.iloc[i]['text'])
traindata_tensor = torch.Tensor(data_list)
if USE_CUDA:
    print("using GPU")
    traindata_tensor = traindata_tensor.cuda()
    train_label = train_label.cuda()

'''
Input shapes:
input(seq_len, batch, input_size)
h0(num_layers * num_directions, batch, hidden_size)
c0(num_layers * num_directions, batch, hidden_size)
Output shapes:
output(seq_len, batch, hidden_size * num_directions)
hn(num_layers * num_directions, batch, hidden_size)
cn(num_layers * num_directions, batch, hidden_size)
'''

class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, batch_size, str_len):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.str_len = str_len
        self.batch_size = batch_size
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(str_len * hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.zeros(1, self.batch_size, self.hidden_dim).cuda(),
                torch.zeros(1, self.batch_size, self.hidden_dim).cuda())

    def forward(self, sentence, state):
        embeds = self.word_embeddings(sentence)
        self.hidden = state
        # Note: view() only reshapes; it does not transpose (batch, seq_len)
        # into (seq_len, batch), so batch and sequence entries get mixed.
        lstm_out, self.hidden = self.lstm(embeds.view(self.str_len, len(sentence), -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(self.batch_size, -1))
        # tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_space, self.hidden

model = LSTMTagger(10, 100, len(dict), 1, 16, 31)
model = model.cuda()

def train(model, train_data, train_label, num_epochs, learning_rate, batch_size):
    train_ls = []
    # loss = torch.nn.CrossEntropyLoss()
    state = None
    loss = torch.nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    dataset = torch.utils.data.TensorDataset(train_data, train_label)
    train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)
    for epoch in range(num_epochs):
        correct = 0
        total = 0
        start = time.time()
        index = 0
        for X, y in train_iter:
            index += 1
            if index >= 470:           # drop the last, incomplete batch
                break
            if state is not None:      # detach the hidden state from the previous batch
                if isinstance(state, tuple):  # LSTM state: (h, c)
                    state = (state[0].detach(), state[1].detach())
                else:
                    state = state.detach()
            X = X.long()
            (output, state) = model(X, state)
            l = loss(output.float(), y)
            l.backward()
            optimizer.step()
            optimizer.zero_grad()
            output = [1 if i > 0.5 else 0 for i in output]  # threshold at 0.5
            for i in range(len(output)):
                total += 1
                if output[i] == y[i]:
                    correct += 1
        acc = correct / total
        end = time.time()
        print("epoch ", str(epoch), "time: ", end - start, " loss: ", l.item())
        print("correct: ", correct, " total: ", total, "acc: ", acc)
        print("\n")
        train_ls.append(l.item())
    return train_ls

# torch.save(model, 'model.pkl')
# model = torch.load('model.pkl')

def out_put(model):
    test_list = []
    max_len = 31
    for x in test_data.text:
        while len(x) < max_len:        # pad short texts
            x.append(0)
        if len(x) > max_len:           # truncate long texts in place
            del x[max_len:]
    test_rows = []                     # a fresh list; appending to the global
    for i in range(0, len(test_data)): # data_list would mix in the training rows
        test_rows.append(test_data.iloc[i]['text'])
    test_data_tensor = torch.Tensor(test_rows)
    if USE_CUDA:
        print("using GPU")
        test_data_tensor = test_data_tensor.cuda()
    with torch.no_grad():
        result_dataset = torch.utils.data.TensorDataset(test_data_tensor)
        result_dataloader = torch.utils.data.DataLoader(result_dataset, batch_size=16, shuffle=False)
        state = None
        index = 0
        for X in result_dataloader:
            X = X[0]
            if index >= 203:           # drop the last, incomplete batch
                break
            index += 1
            if state is not None:
                if isinstance(state, tuple):  # LSTM state: (h, c)
                    state = (state[0].detach(), state[1].detach())
                else:
                    state = state.detach()
            X = X.long()
            (output, state) = model(X, state)
            output = [1 if i > 0.5 else 0 for i in output]
            for i in range(len(output)):
                test_list.append(output[i])
    print(len(test_list))
    while len(test_list) < 3263:       # pad the dropped rows with 0 (3263 test rows)
        test_list.append(0)
    df_output = pd.DataFrame()
    aux = pd.read_csv(base_path + 'test.csv')
    df_output['id'] = aux['id']
    df_output['target'] = test_list
    df_output[['id', 'target']].to_csv(base_path + 's1mple.csv', index=False)

train(model, traindata_tensor, train_label, 250, 0.1, 16)
out_put(model)
```

Results

Local training set

(screenshot of local training results)

Online score

The score was only 0.5-something, at the 93% mark of the leaderboard.
The results were dreadful.


Summary

  1. Limited by my programming skill, I did not use a better loss function or optimizer.
  2. Because the model's input is restricted to fixed-length batches of sentences, part of the training data goes unused and part of the test data has to be filled in arbitrarily (with batch_size = 16, only multiples of 16 can be consumed); see the sketch after this list for one way around this.
  3. Because the data set is small and all of it went into training, the local results overfit badly (local training accuracy peaked at 98%, while the online score was obviously rock-bottom).
  4. Because of the restriction in point 2, at first the last batch read contained fewer than 16 samples, so the input dimensions were wrong and the code crashed. With my limited experience I could not spot the bug for a long time, and error messages are especially cryptic when running on the GPU. My advice: when debugging, turn the GPU off and analyze on the CPU first.
  5. Another problem is that I did not make full use of the information: only the text column was used for prediction.
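
Points 2 and 4 both stem from hard-coding batch_size into init_hidden and the view() calls. Below is a minimal sketch of a variant (my own, not the code used above) that infers the batch size from the input tensor, so a smaller final batch no longer crashes:

```python
import torch
import torch.nn as nn

class DynamicLSTMTagger(nn.Module):
    """Variant that infers the batch size from the input, so the last,
    smaller batch of a DataLoader no longer has to be dropped or padded."""
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, str_len):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.str_len = str_len
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.hidden2tag = nn.Linear(str_len * hidden_dim, tagset_size)

    def forward(self, sentence):
        batch_size = sentence.size(0)            # taken from the data, not hard-coded
        h0 = torch.zeros(1, batch_size, self.hidden_dim, device=sentence.device)
        c0 = torch.zeros(1, batch_size, self.hidden_dim, device=sentence.device)
        embeds = self.word_embeddings(sentence)  # (batch, seq_len, embedding_dim)
        lstm_out, _ = self.lstm(embeds, (h0, c0))  # (batch, seq_len, hidden_dim)
        return self.hidden2tag(lstm_out.reshape(batch_size, -1))
```

With this change every row of the training and test sets can be consumed, and padding the submission with zeros becomes unnecessary.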

Improvements

Train less

As a first rough attempt I set 15 epochs to reduce overfitting.
The score improved.
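
Assuming the train function keeps the signature used above, the change is just the epoch argument:

```python
train(model, traindata_tensor, train_label, 15, 0.1, 16)  # was 250 epochs
```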

Dropout

Adding dropout improved the score somewhat.

```python
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, batch_size, str_len):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.str_len = str_len
        self.batch_size = batch_size
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.dropout = nn.Dropout(0.5)
        self.hidden2tag = nn.Linear(str_len * hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.zeros(1, self.batch_size, self.hidden_dim).cuda(),
                torch.zeros(1, self.batch_size, self.hidden_dim).cuda())

    def forward(self, sentence, state, train_flag):
        embeds = self.word_embeddings(sentence)
        self.hidden = state
        lstm_out, self.hidden = self.lstm(embeds.view(self.str_len, len(sentence), -1), self.hidden)
        if train_flag:                     # apply dropout only during training
            lstm_out = self.dropout(lstm_out)
        tag_space = self.hidden2tag(lstm_out.view(self.batch_size, -1))
        # tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_space, self.hidden
```

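
A side note on the design: the manual train_flag argument works, but nn.Dropout already respects the module's training mode, so the idiomatic alternative is to apply self.dropout unconditionally in forward and switch modes from the outside. A sketch of the calling side, assuming such a flag-free variant of the class:

```python
model.train()                        # dropout active during training steps
output, state = model(X, state)

model.eval()                         # dropout disabled for evaluation
with torch.no_grad():
    output, state = model(X, state)
```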

Increasing the embedding dimension and hidden_dim

Too lazy to submit this one for scoring...

```python
model = LSTMTagger(300, 256, len(dict), 1, 16, 31)
```

Final code (no more tweaking)


```python
import time

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

USE_CUDA = torch.cuda.is_available()

base_path = "D:\\New_desktop\\nlp-getting-started\\"
read_train = pd.read_csv(base_path + 'train.csv')
train_data = read_train.iloc[:, [1, 2, 3]].copy()   # keyword, location, text
train_label = torch.tensor(read_train.iloc[:, [4]].values, dtype=torch.float)
read_test = pd.read_csv(base_path + 'test.csv')
test_data = read_test.iloc[:, [1, 2, 3]].copy()

# Build the vocabulary from keyword + location + text of both train and test
sentence = ""
for i in range(0, len(train_data)):
    sentence += str(train_data.iloc[i]['keyword']) + " "
    sentence += str(train_data.iloc[i]['location']) + " "
    sentence += str(train_data.iloc[i]['text']) + " "
for i in range(0, len(test_data)):
    sentence += str(test_data.iloc[i]['keyword']) + " "
    sentence += str(test_data.iloc[i]['location']) + " "
    sentence += str(test_data.iloc[i]['text']) + " "
dict = set(sentence.split())   # note: shadows the built-in `dict`
print(len(dict))

# word -> index mapping
w2i = {}
def word2index():
    index = 0
    for i in dict:
        w2i[i] = index
        index += 1
word2index()

# Replace every column with lists of word indices
train_data['keyword'] = [[w2i[i] for i in str(x).split()] for x in train_data.keyword]
train_data['location'] = [[w2i[i] for i in str(x).split()] for x in train_data.location]
train_data['text'] = [[w2i[i] for i in str(x).split()] for x in train_data.text]
test_data['keyword'] = [[w2i[i] for i in str(x).split()] for x in test_data.keyword]
test_data['location'] = [[w2i[i] for i in str(x).split()] for x in test_data.location]
test_data['text'] = [[w2i[i] for i in str(x).split()] for x in test_data.text]

# Pad every training text with index 0 up to the maximum length
data_list = []
max_len = 0
for i in range(0, len(train_data)):
    if len(train_data.iloc[i]['text']) > max_len:
        max_len = len(train_data.iloc[i]['text'])
print(max_len)   # 31 for this data set
for x in train_data.text:
    while len(x) < max_len:
        x.append(0)
for i in range(0, len(train_data)):
    data_list.append(train_data.iloc[i]['text'])
traindata_tensor = torch.Tensor(data_list)
if USE_CUDA:
    print("using GPU")
    traindata_tensor = traindata_tensor.cuda()
    train_label = train_label.cuda()

'''
Input shapes:
input(seq_len, batch, input_size)
h0(num_layers * num_directions, batch, hidden_size)
c0(num_layers * num_directions, batch, hidden_size)
Output shapes:
output(seq_len, batch, hidden_size * num_directions)
hn(num_layers * num_directions, batch, hidden_size)
cn(num_layers * num_directions, batch, hidden_size)
'''

class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, batch_size, str_len):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.str_len = str_len
        self.batch_size = batch_size
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.dropout = nn.Dropout(0.5)
        self.hidden2tag = nn.Linear(str_len * hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.zeros(1, self.batch_size, self.hidden_dim).cuda(),
                torch.zeros(1, self.batch_size, self.hidden_dim).cuda())

    def forward(self, sentence, state, train_flag):
        embeds = self.word_embeddings(sentence)
        self.hidden = state
        # Note: view() only reshapes; it does not transpose (batch, seq_len)
        # into (seq_len, batch), so batch and sequence entries get mixed.
        lstm_out, self.hidden = self.lstm(embeds.view(self.str_len, len(sentence), -1), self.hidden)
        if train_flag:                     # apply dropout only during training
            lstm_out = self.dropout(lstm_out)
        tag_space = self.hidden2tag(lstm_out.view(self.batch_size, -1))
        # tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_space, self.hidden

model = LSTMTagger(300, 256, len(dict), 1, 16, 31)
model = model.cuda()
print(model)

def train(model, train_data, train_label, num_epochs, learning_rate, batch_size):
    train_ls = []
    # loss = torch.nn.CrossEntropyLoss()
    state = None
    loss = torch.nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=0)
    dataset = torch.utils.data.TensorDataset(train_data, train_label)
    train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)
    for epoch in range(num_epochs):
        correct = 0
        total = 0
        start = time.time()
        index = 0
        for X, y in train_iter:
            index += 1
            if index > 400:            # train on 400 batches, leave the rest for eval
                break
            if state is not None:      # detach the hidden state from the previous batch
                if isinstance(state, tuple):  # LSTM state: (h, c)
                    state = (state[0].detach(), state[1].detach())
                else:
                    state = state.detach()
            X = X.long()
            (output, state) = model(X, state, True)
            l = loss(output.float(), y)
            l.backward()
            optimizer.step()
            optimizer.zero_grad()
            output = [1 if i > 0.5 else 0 for i in output]  # threshold at 0.5
            for i in range(len(output)):
                total += 1
                if output[i] == y[i]:
                    correct += 1
        acc = correct / total
        end = time.time()
        print("epoch ", str(epoch), "time: ", end - start, " loss: ", l.item(),
              "correct: ", correct, " total: ", total, "acc: ", acc)
        torch.save(model, 'model.pkl')
        eval(model, train_data, train_label, batch_size)
        train_ls.append(l.item())
    return train_ls

# model = torch.load('model.pkl')

def eval(model, train_data, train_label, batch_size):   # note: shadows the built-in eval
    test_list = []
    dataset = torch.utils.data.TensorDataset(train_data, train_label)
    train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=False)
    with torch.no_grad():
        state = None
        index = 0
        for X, y in train_iter:
            index += 1
            if index <= 400:           # skip the 400 training batches
                continue
            if index >= 470:           # drop the last, incomplete batch
                break
            if state is not None:
                if isinstance(state, tuple):  # LSTM state: (h, c)
                    state = (state[0].detach(), state[1].detach())
                else:
                    state = state.detach()
            X = X.long()
            (output, state) = model(X, state, False)
            output = [1 if i > 0.5 else 0 for i in output]
            for i in range(len(output)):
                test_list.append(output[i])
    correct = 0
    for i in range(0, len(test_list)):
        # Batches 1..400 were skipped, so predictions start at row 400 * 16 = 6400.
        # (Because the training DataLoader shuffles, these rows are not strictly held out.)
        if train_label[6400 + i] == test_list[i]:
            correct += 1
    print("eval: ", correct / len(test_list))

def out_put(model):
    test_list = []
    max_len = 31
    for x in test_data.text:
        while len(x) < max_len:        # pad short texts
            x.append(0)
        if len(x) > max_len:           # truncate long texts in place
            del x[max_len:]
    test_rows = []                     # a fresh list; appending to the global
    for i in range(0, len(test_data)): # data_list would mix in the training rows
        test_rows.append(test_data.iloc[i]['text'])
    test_data_tensor = torch.Tensor(test_rows)
    if USE_CUDA:
        print("using GPU")
        test_data_tensor = test_data_tensor.cuda()
    with torch.no_grad():
        result_dataset = torch.utils.data.TensorDataset(test_data_tensor)
        result_dataloader = torch.utils.data.DataLoader(result_dataset, batch_size=16, shuffle=False)
        state = None
        index = 0
        for X in result_dataloader:
            X = X[0]
            if index >= 203:           # drop the last, incomplete batch
                break
            index += 1
            if state is not None:
                if isinstance(state, tuple):  # LSTM state: (h, c)
                    state = (state[0].detach(), state[1].detach())
                else:
                    state = state.detach()
            X = X.long()
            (output, state) = model(X, state, False)
            output = [1 if i > 0.5 else 0 for i in output]
            for i in range(len(output)):
                test_list.append(output[i])
    print(len(test_list))
    while len(test_list) < 3263:       # pad the dropped rows with 0 (3263 test rows)
        test_list.append(0)
    df_output = pd.DataFrame()
    aux = pd.read_csv(base_path + 'test.csv')
    df_output['id'] = aux['id']
    df_output['target'] = test_list
    df_output[['id', 'target']].to_csv(base_path + 's1mple.csv', index=False)

train(model, traindata_tensor, train_label, 30, 0.01, 16)
print("\n")
eval(model, traindata_tensor, train_label, 16)
out_put(model)
```