Problem

https://www.kaggle.com/c/nlp-getting-started

Solution

Fine-tune BERT, using the base pre-trained model.

Basic approach

Of the 7,600+ labeled training rows, use the first 6,000 for training and the remaining 1,600+ as a held-out test split.
Whenever the held-out accuracy reaches a new best, regenerate the submission file.
Treat the task as binary classification.
To keep the implementation simple, use only the text column (see the sketch below).
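
Concretely, this is just BertForSequenceClassification with a 2-way head on the raw tweet text. A minimal sketch of the idea (the tweet is made up; this assumes a transformers version whose models return tuples, as in the full code below):

    import torch
    from transformers import BertTokenizer, BertForSequenceClassification

    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)

    # one hypothetical tweet -> [CLS] ids [SEP] -> two logits, one per class
    ids = tokenizer.encode("thousands evacuated as floods hit the coast", add_special_tokens=True)
    logits = model(torch.tensor([ids]))[0]
    print(logits.shape)   # torch.Size([1, 2])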

Code

    #main.py
    import argparse
    import pandas as pd
    import torch
    import torch.optim as optim
    from torch.utils.data import TensorDataset, DataLoader
    from transformers import BertTokenizer
    from transformers import BertForSequenceClassification
    #import bert_  # local experimental variant, only needed for the commented-out model below

    USE_CUDA = torch.cuda.is_available()
    #USE_CUDA = False  # uncomment to force CPU
    base_path = "D:\\New_desktop\\nlp-getting-started\\"
    read_train = pd.read_csv(base_path + 'train.csv')
    train_data = read_train.iloc[:, [1, 2, 3]].copy()   # keyword, location, text
    train_label = list(read_train.target)
    read_test = pd.read_csv(base_path + 'test.csv')
    test_data = read_test.iloc[:, [1, 2, 3]].copy()     # bug fix: was sliced to read_train's length
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    train_data['text'] = [tokenizer.encode(x, add_special_tokens=False) for x in train_data.text]
    #input_ids = torch.tensor(tokenizer.encode("美国", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    data_list = []
    max_len = 40
    '''
    # the scan that found the longest tokenized tweet:
    for i in range(len(train_data)):
        if len(train_data.iloc[i]['text']) > max_len:
            max_len = len(train_data.iloc[i]['text'])
    print("max_strlen: ", max_len)
    '''
    for x in train_data.text:
        if len(x) > max_len:
            x = x[:max_len]              # truncate long tweets
        ids = [101] + x + [102]          # [CLS] tokens [SEP]
        while len(ids) < max_len + 2:
            ids.append(0)                # bug fix: pad after [SEP] instead of before it
        data_list.append(ids)
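
    # Note: newer versions of transformers can fold the truncation/padding above into
    # a single call (an assumption about the installed version, not relied on here):
    #   ids = tokenizer.encode(text, add_special_tokens=True, max_length=max_len + 2,
    #                          truncation=True, padding='max_length')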
    eval_list = data_list[6000:]
    data_list = data_list[:6000]
    eval_label = train_label[6000:]
    train_label = train_label[:6000]
    print("train label len: ", len(train_label))   # bug fix: was printing len(data_list)
    print("eval label len: ", len(eval_label))     # bug fix: was printing len(eval_list)
    print("train list len: ", len(data_list))
    print("eval list len: ", len(eval_list))
    print("sequence len: ", len(data_list[0]))
    train_label = torch.tensor(train_label)
    eval_label = torch.tensor(eval_label)
    traindata_tensor = torch.tensor(data_list, dtype=torch.long)
    eval_tensor = torch.tensor(eval_list, dtype=torch.long)
    if USE_CUDA:
        print("using GPU")
        traindata_tensor = traindata_tensor.cuda()
        train_label = train_label.cuda()
        eval_tensor = eval_tensor.cuda()
        eval_label = eval_label.cuda()
    def get_train_args():
        parser = argparse.ArgumentParser()
        parser.add_argument('--batch_size', type=int, default=32, help='samples per batch')
        parser.add_argument('--nepoch', type=int, default=30, help='number of training epochs')
        parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
        parser.add_argument('--gpu', type=bool, default=True, help='whether to use the GPU')
        parser.add_argument('--num_workers', type=int, default=2, help='number of dataloader worker threads')
        parser.add_argument('--num_labels', type=int, default=2, help='number of classes')
        parser.add_argument('--data_path', type=str, default='./data', help='data directory')
        opt = parser.parse_args()
        print(opt)
        return opt
    def get_model(opt):
        model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=opt.num_labels)
        #model = bert_.BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=opt.num_labels)
        #model = bert_LSTM.Net()
        return model
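
    # Optional variation (an untested sketch, not part of the run below): freeze the
    # BERT encoder and fine-tune only the classification head, which is much cheaper:
    #   for p in model.bert.parameters():
    #       p.requires_grad = False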
    # Encode the test set once, with the same [CLS]/[SEP]/padding scheme as the training
    # data. (Bug fix: this used to run inside out_put, which breaks when out_put is
    # called a second time and tries to re-encode the already-encoded column.)
    test_data['text'] = [tokenizer.encode(x, add_special_tokens=False) for x in test_data.text]
    test_list = []
    for x in test_data.text:
        if len(x) > max_len:
            x = x[:max_len]
        ids = [101] + x + [102]
        while len(ids) < max_len + 2:
            ids.append(0)
        test_list.append(ids)
    print("test rows: ", len(test_list))
    test_data_tensor = torch.tensor(test_list, dtype=torch.long)
    if USE_CUDA:
        test_data_tensor = test_data_tensor.cuda()

    def out_put(net, batchsize):
        result = []
        with torch.no_grad():
            result_dataloader = DataLoader(TensorDataset(test_data_tensor), batch_size=batchsize, shuffle=False)
            for (X,) in result_dataloader:
                outputs = net(X)
                logits = outputs[0]
                _, predicted = torch.max(logits.data, 1)
                # bug fix: collect plain ints instead of CUDA tensors, and keep the final
                # partial batch instead of padding the submission out to 3263 with zeros
                result.extend(predicted.cpu().tolist())
        print(len(result))
        df_output = pd.DataFrame()
        df_output['id'] = read_test['id']
        df_output['target'] = result
        df_output[['id', 'target']].to_csv(base_path + 's1mple.csv', index=False)
        print("rewrote the result csv")
    def evaluate(net, eval_data, eval_label, batch_size, pre):   # renamed from eval(), which shadows the builtin
        net.eval()
        eval_iter = DataLoader(TensorDataset(eval_data, eval_label), batch_size, shuffle=False)
        total = 0
        correct = 0
        with torch.no_grad():
            for X, y in eval_iter:
                outputs = net(X, labels=y)
                loss, logits = outputs[:2]
                _, predicted = torch.max(logits.data, 1)
                total += X.size(0)
                correct += predicted.data.eq(y.data).cpu().sum()
        s = correct.item() / total
        print("right", correct.item(), "total", total, "Acc:", s)
        if s > pre:
            print("new best, refreshing the submission file")
            out_put(net, batch_size)
            pre = s       # bug fix: only advance the best score on an actual improvement
        return pre
    def train(net, train_data, train_label, eval_tensor, eval_label, num_epochs, learning_rate, batch_size):
        optimizer = optim.SGD(net.parameters(), lr=learning_rate, weight_decay=0)
        train_iter = DataLoader(TensorDataset(train_data, train_label), batch_size, shuffle=True)
        pre = 0
        for epoch in range(num_epochs):
            net.train()   # bug fix: evaluate() leaves the net in eval mode, so reset it every epoch
            correct = 0
            total = 0
            it = 0
            for X, y in train_iter:
                it += 1
                optimizer.zero_grad()
                outputs = net(X, labels=y)
                loss, logits = outputs[0], outputs[1]
                _, predicted = torch.max(logits.data, 1)
                loss.backward()
                optimizer.step()
                total += X.size(0)
                correct += predicted.data.eq(y.data).cpu().sum()
                if it % 50 == 0:
                    print("epoch", epoch, "loss: %.4f" % loss.item(),
                          "right", correct.item(), "total", total,
                          "Acc: %.3f" % (correct.item() / total))
                    torch.save(net.state_dict(), 'model.pth')
            pre = evaluate(net, eval_tensor, eval_label, batch_size, pre)
        return
    opt = get_train_args()
    model = get_model(opt)
    if USE_CUDA:
        model = model.cuda()
    #model.load_state_dict(torch.load('model.pth'))   # resume from a checkpoint (bug fix: the file is saved as model.pth, not model.pkl)
    """
    for name, parameters in model.named_parameters():
        print(name, ':', parameters.size())
    """
    train(model, traindata_tensor, train_label, eval_tensor, eval_label, 30, 0.001, 16)
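
To reuse a trained checkpoint later without rerunning the whole script, the saved state_dict can be loaded back into a fresh model. A minimal sketch, assuming the same model.pth produced by the training loop above (the example tweet is made up; target 1 means a real disaster):

    import torch
    from transformers import BertTokenizer, BertForSequenceClassification

    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)
    model.load_state_dict(torch.load('model.pth', map_location='cpu'))
    model.eval()

    # hypothetical tweet, encoded with the standard [CLS]/[SEP] wrapping
    ids = tokenizer.encode("wildfire spreading towards the north of the city", add_special_tokens=True)
    with torch.no_grad():
        logits = model(torch.tensor([ids]))[0]
    print("predicted class:", logits.argmax(dim=1).item())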

Results

Local run

Training went fairly well: with lr=0.001 and batch_size=16 the model converged quickly,
plateauing at roughly 82% accuracy on the held-out split.


Online score

Scored 0.82 on the leaderboard.
Ranked 861, top 29%.
