The competition
https://www.kaggle.com/c/nlp-getting-started
In short, it is a binary classification problem over sentences.
Basic approach
- Build a vocabulary
- Encode each input sentence as a sequence of word indices
- Feed the index sequence into an embedding layer
- Feed the embeddings into an LSTM
- Feed the LSTM output into a Linear layer
- Compute the loss between the single output value and the 0/1 label
- Loss function: torch.nn.MSELoss()
- Optimizer: optim.SGD(model.parameters(), lr=learning_rate)
Overall architecture: Embedding + LSTM + Linear (a minimal shape sketch follows).
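A minimal sketch of that pipeline; the dimensions here are illustrative placeholders, not the values used in the full code below:

import torch
import torch.nn as nn

embedding = nn.Embedding(num_embeddings=1000, embedding_dim=10)
lstm = nn.LSTM(input_size=10, hidden_size=100)
linear = nn.Linear(31 * 100, 1)

ids = torch.randint(0, 1000, (31, 16))                 # (seq_len, batch) word indices
emb = embedding(ids)                                   # (31, 16, 10)
out, (h, c) = lstm(emb)                                # (31, 16, 100)
score = linear(out.permute(1, 0, 2).reshape(16, -1))   # (16, 1), compared to the 0/1 label via MSE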
Code
import time

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
USE_CUDA = torch.cuda.is_available()
base_path = "D:\\New_desktop\\nlp-getting-started\\"
read_train = pd.read_csv(base_path + 'train.csv')
train_data = read_train.iloc[:, [1, 2, 3]]   # keyword, location, text
train_label = read_train.iloc[:, [4]]        # target (0 or 1)
train_label = torch.tensor(train_label.values, dtype=torch.float)
read_test = pd.read_csv(base_path + 'test.csv')
test_data = read_test.iloc[:, [1, 2, 3]]     # keyword, location, text
# Concatenate every keyword/location/text field from both train and test so the
# vocabulary covers every word that will ever be indexed.
sentence = ""
for i in range(0, len(train_data)):
    sentence += str(train_data.iloc[i]['keyword']) + " "
    sentence += str(train_data.iloc[i]['location']) + " "
    sentence += str(train_data.iloc[i]['text']) + " "
for i in range(0, len(test_data)):
    sentence += str(test_data.iloc[i]['keyword']) + " "
    sentence += str(test_data.iloc[i]['location']) + " "
    sentence += str(test_data.iloc[i]['text']) + " "
vocab = set(sentence.split())   # set of all whitespace-separated tokens
print(len(vocab))
w2i = {}
def word2index():
    index = 0
    for w in vocab:
        w2i[w] = index
        index += 1
word2index()
train_data['keyword'] = [[w2i[i] for i in str(x).split()] for x in train_data.keyword]
train_data['location'] = [[w2i[i] for i in str(x).split()] for x in train_data.location]
train_data['text'] = [[w2i[i] for i in str(x).split()] for x in train_data.text]
#print(train_data[:read_train.shape[0]])
test_data['keyword'] = [[w2i[i] for i in str(x).split()] for x in test_data.keyword]
test_data['location'] = [[w2i[i] for i in str(x).split()] for x in test_data.location]
test_data['text'] = [[w2i[i] for i in str(x).split()] for x in test_data.text]
data_list = []
max_len = 0
for i in range(0, len(train_data)):
    if len(train_data.iloc[i]['text']) > max_len:
        max_len = len(train_data.iloc[i]['text'])
print(max_len)
# Pad every training text with index 0 up to max_len so all rows share one length.
# (Index 0 is also a real word id, so padding collides with one vocabulary entry.)
for x in train_data.text:
    while len(x) < max_len:
        x.append(0)
for i in range(0, len(train_data)):
    data_list.append(train_data.iloc[i]['text'])
traindata_tensor = torch.Tensor(data_list)
if USE_CUDA:
    print("using GPU")
    traindata_tensor = traindata_tensor.cuda()
    train_label = train_label.cuda()
'''
LSTM input shapes:
    input: (seq_len, batch, input_size)
    h0:    (num_layers * num_directions, batch, hidden_size)
    c0:    (num_layers * num_directions, batch, hidden_size)
LSTM output shapes:
    output: (seq_len, batch, hidden_size * num_directions)
    hn:     (num_layers * num_directions, batch, hidden_size)
    cn:     (num_layers * num_directions, batch, hidden_size)
'''
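# Illustrative shape check (assumed dims, matching the model built below):
#   lstm = nn.LSTM(input_size=10, hidden_size=100)
#   x = torch.randn(31, 16, 10)    # (seq_len, batch, input_size)
#   out, (h, c) = lstm(x)          # out: (31, 16, 100); h and c: (1, 16, 100)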
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, batch_size, str_len):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.str_len = str_len
        self.batch_size = batch_size
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # Concatenate the LSTM output of every time step, then map to one score.
        self.hidden2tag = nn.Linear(str_len * hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.zeros(1, self.batch_size, self.hidden_dim).cuda(),
                torch.zeros(1, self.batch_size, self.hidden_dim).cuda())

    def forward(self, sentence, state):
        embeds = self.word_embeddings(sentence)
        self.hidden = state
        lstm_out, self.hidden = self.lstm(embeds.view(self.str_len, len(sentence), -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(self.batch_size, -1))
        return tag_space, self.hidden
# embedding_dim=10, hidden_dim=100, 1 output unit, batch_size=16, seq_len=31
model = LSTMTagger(10, 100, len(vocab), 1, 16, 31)
model = model.cuda()
def train(model, train_data, train_label, num_epochs, learning_rate, batch_size):
    train_ls = []
    state = None
    loss = torch.nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    dataset = torch.utils.data.TensorDataset(train_data, train_label)
    train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)
    for epoch in range(num_epochs):
        correct = 0
        total = 0
        start = time.time()
        index = 0
        for X, y in train_iter:
            index += 1
            if index >= 470:   # stop early so the fixed batch size of 16 always holds
                break
            if state is not None:
                # Detach the hidden state so gradients do not flow across batches.
                if isinstance(state, tuple):   # LSTM state: (h, c)
                    state = (state[0].detach(), state[1].detach())
                else:
                    state = state.detach()
            X = X.long()
            output, state = model(X, state)
            l = loss(output.float(), y)
            l.backward()
            optimizer.step()
            optimizer.zero_grad()
            preds = [1 if i > 0.5 else 0 for i in output]
            for i in range(len(preds)):
                total += 1
                if preds[i] == y[i]:
                    correct += 1
        acc = correct / total
        end = time.time()
        print("epoch", epoch, "time:", end - start, "loss:", l.item())
        print("correct:", correct, "total:", total, "acc:", acc)
        train_ls.append(l.item())
    return train_ls
#torch.save(model_object, 'model.pkl')
#model = torch.load('model.pkl')
def out_put(model):
    test_list = []
    max_len = 31
    # Pad or truncate every test text to max_len in place.
    for x in test_data.text:
        while len(x) < max_len:
            x.append(0)
        del x[max_len:]
    test_rows = []   # fresh list: do not reuse the training data_list here
    for i in range(0, len(test_data)):
        test_rows.append(test_data.iloc[i]['text'])
    test_data_tensor = torch.Tensor(test_rows)
    if USE_CUDA:
        print("using GPU")
        test_data_tensor = test_data_tensor.cuda()
    with torch.no_grad():
        result_dataset = torch.utils.data.TensorDataset(test_data_tensor)
        result_dataloader = torch.utils.data.DataLoader(result_dataset, batch_size=16, shuffle=False)
        state = None
        index = 0
        for X in result_dataloader:
            X = X[0]
            if index >= 203:   # only 203 full batches of 16 fit the fixed-size model
                break
            index += 1
            X = X.long()
            output, state = model(X, state)
            preds = [1 if i > 0.5 else 0 for i in output]
            test_list.extend(preds)
    print(len(test_list))
    while len(test_list) < 3263:   # fill the rows the fixed batch size could not cover
        test_list.append(0)
    df_output = pd.DataFrame()
    aux = pd.read_csv(base_path + 'test.csv')
    df_output['id'] = aux['id']
    df_output['target'] = test_list
    df_output[['id', 'target']].to_csv(base_path + 's1mple.csv', index=False)
train(model, traindata_tensor, train_label, 250, 0.1, 16)
out_put(model)
Results
Local training set: (screenshot of the training log omitted)
Online score: only 0.5-something, ranked around the 93rd percentile.
The results were terrible.
Summary
- Given my limited programming experience, I did not try a better loss function or optimizer (a hedged alternative is sketched after this list).
- Because the model input is restricted to fixed-length batches of sentences, part of the training data goes unused and part of the test predictions has to be filled in arbitrarily (with batch_size=16, only multiples of 16 samples can be consumed).
- With little data, all of it used for training, the model overfits: local training accuracy peaked at 98%, yet the online score was clearly awful.
- The fixed batch size also meant the last batch read could contain fewer than 16 samples, which broke the input dimensions and caused a bug I failed to spot for a long time. The error messages were especially cryptic when running on the GPU; when hunting such bugs, it helps to switch the GPU off and debug on the CPU first.
- Finally, I did not use all the available information: only the text column was used for prediction.
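As a hedged sketch of how the first two points could be addressed (this is not the original submission's code): treat the single output as a logit with BCEWithLogitsLoss, switch to Adam, and classify from the final hidden state so any batch size works:

import torch
import torch.nn as nn

class LSTMClassifier(nn.Module):
    """Batch-size-agnostic variant: classify from the last hidden state."""
    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=128):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)      # one logit per sentence

    def forward(self, x):                       # x: (batch, seq_len), any batch size
        _, (h_n, _) = self.lstm(self.embed(x))  # h_n: (1, batch, hidden_dim)
        return self.fc(h_n.squeeze(0))          # (batch, 1) raw logits

model = LSTMClassifier(vocab_size=30000)        # vocab size is a placeholder
criterion = nn.BCEWithLogitsLoss()              # applies the sigmoid internally
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

x = torch.randint(0, 30000, (7, 31))            # an odd batch of 7 is fine here
loss = criterion(model(x), torch.zeros(7, 1))
loss.backward()
optimizer.step()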
Improvements
Train less
As a first rough cut I reduced training to 15 epochs to curb overfitting.
The score went up (an early-stopping sketch follows).
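Hand-picking an epoch count works, but early stopping on a held-out split is the more systematic version of the same idea. A sketch, assuming a model and two hypothetical helpers, train_one_epoch and validate:

import torch

best_loss, patience, bad_epochs = float('inf'), 3, 0
for epoch in range(100):
    train_one_epoch(model)                           # hypothetical helper
    val_loss = validate(model)                       # hypothetical helper
    if val_loss < best_loss:
        best_loss, bad_epochs = val_loss, 0
        torch.save(model.state_dict(), 'best.pt')    # keep the best checkpoint
    else:
        bad_epochs += 1
        if bad_epochs >= patience:                   # stop once validation stops improving
            break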
Dropout
Adding dropout improved the score a bit more:
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, batch_size, str_len):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.str_len = str_len
        self.batch_size = batch_size
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.dropout = nn.Dropout(0.5)
        self.hidden2tag = nn.Linear(str_len * hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.zeros(1, self.batch_size, self.hidden_dim).cuda(),
                torch.zeros(1, self.batch_size, self.hidden_dim).cuda())

    def forward(self, sentence, state, train_flag):
        embeds = self.word_embeddings(sentence)
        self.hidden = state
        lstm_out, self.hidden = self.lstm(embeds.view(self.str_len, len(sentence), -1), self.hidden)
        if train_flag:   # apply dropout only during training
            lstm_out = self.dropout(lstm_out)
        tag_space = self.hidden2tag(lstm_out.view(self.batch_size, -1))
        return tag_space, self.hidden
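Passing a train_flag around works, but nn.Module already tracks the mode: a sketch of the more idiomatic pattern, where model.train() / model.eval() toggle self.training and nn.Dropout disables itself automatically:

def forward(self, sentence, state):
    embeds = self.word_embeddings(sentence)
    lstm_out, state = self.lstm(embeds.view(self.str_len, len(sentence), -1), state)
    lstm_out = self.dropout(lstm_out)   # a no-op after model.eval()
    return self.hidden2tag(lstm_out.view(self.batch_size, -1)), state

With this, the caller only needs model.train() before the training loop and model.eval() before inference.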
Increasing embedding_dim and hidden_dim
Too lazy to submit this one for scoring...
model = LSTMTagger(300, 256, len(vocab), 1, 16, 31)
Final code (no further changes)
import time

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
USE_CUDA = torch.cuda.is_available()
base_path = "D:\\New_desktop\\nlp-getting-started\\"
read_train = pd.read_csv(base_path + 'train.csv')
train_data = read_train.iloc[:, [1, 2, 3]]   # keyword, location, text
train_label = read_train.iloc[:, [4]]        # target (0 or 1)
train_label = torch.tensor(train_label.values, dtype=torch.float)
read_test = pd.read_csv(base_path + 'test.csv')
test_data = read_test.iloc[:, [1, 2, 3]]     # keyword, location, text
# Concatenate every keyword/location/text field from both train and test so the
# vocabulary covers every word that will ever be indexed.
sentence = ""
for i in range(0, len(train_data)):
    sentence += str(train_data.iloc[i]['keyword']) + " "
    sentence += str(train_data.iloc[i]['location']) + " "
    sentence += str(train_data.iloc[i]['text']) + " "
for i in range(0, len(test_data)):
    sentence += str(test_data.iloc[i]['keyword']) + " "
    sentence += str(test_data.iloc[i]['location']) + " "
    sentence += str(test_data.iloc[i]['text']) + " "
vocab = set(sentence.split())   # set of all whitespace-separated tokens
print(len(vocab))
w2i = {}
def word2index():
    index = 0
    for w in vocab:
        w2i[w] = index
        index += 1
word2index()
train_data['keyword'] = [[w2i[i] for i in str(x).split()] for x in train_data.keyword]
train_data['location'] = [[w2i[i] for i in str(x).split()] for x in train_data.location]
train_data['text'] = [[w2i[i] for i in str(x).split()] for x in train_data.text]
#print(train_data[:read_train.shape[0]])
test_data['keyword'] = [[w2i[i] for i in str(x).split()] for x in test_data.keyword]
test_data['location'] = [[w2i[i] for i in str(x).split()] for x in test_data.location]
test_data['text'] = [[w2i[i] for i in str(x).split()] for x in test_data.text]
data_list = []
max_len = 0
for i in range(0, len(train_data)):
    if len(train_data.iloc[i]['text']) > max_len:
        max_len = len(train_data.iloc[i]['text'])
print(max_len)
# Pad every training text with index 0 up to max_len so all rows share one length.
for x in train_data.text:
    while len(x) < max_len:
        x.append(0)
for i in range(0, len(train_data)):
    data_list.append(train_data.iloc[i]['text'])
traindata_tensor = torch.Tensor(data_list)
if USE_CUDA:
    print("using GPU")
    traindata_tensor = traindata_tensor.cuda()
    train_label = train_label.cuda()
'''
LSTM input shapes:
    input: (seq_len, batch, input_size)
    h0:    (num_layers * num_directions, batch, hidden_size)
    c0:    (num_layers * num_directions, batch, hidden_size)
LSTM output shapes:
    output: (seq_len, batch, hidden_size * num_directions)
    hn:     (num_layers * num_directions, batch, hidden_size)
    cn:     (num_layers * num_directions, batch, hidden_size)
'''
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, batch_size, str_len):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.str_len = str_len
        self.batch_size = batch_size
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.dropout = nn.Dropout(0.5)
        self.hidden2tag = nn.Linear(str_len * hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.zeros(1, self.batch_size, self.hidden_dim).cuda(),
                torch.zeros(1, self.batch_size, self.hidden_dim).cuda())

    def forward(self, sentence, state, train_flag):
        embeds = self.word_embeddings(sentence)
        self.hidden = state
        lstm_out, self.hidden = self.lstm(embeds.view(self.str_len, len(sentence), -1), self.hidden)
        if train_flag:   # apply dropout only during training
            lstm_out = self.dropout(lstm_out)
        tag_space = self.hidden2tag(lstm_out.view(self.batch_size, -1))
        return tag_space, self.hidden
model = LSTMTagger(300, 256, len(vocab), 1, 16, 31)
model = model.cuda()
print(model)
def train(model, train_data, train_label, num_epochs, learning_rate, batch_size):
    train_ls = []
    state = None
    loss = torch.nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=0)
    dataset = torch.utils.data.TensorDataset(train_data, train_label)
    train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)
    for epoch in range(num_epochs):
        correct = 0
        total = 0
        start = time.time()
        index = 0
        for X, y in train_iter:
            index += 1
            if index > 400:   # keep batches 401+ as a rough holdout for evaluate()
                break
            if state is not None:
                # Detach the hidden state so gradients do not flow across batches.
                if isinstance(state, tuple):   # LSTM state: (h, c)
                    state = (state[0].detach(), state[1].detach())
                else:
                    state = state.detach()
            X = X.long()
            output, state = model(X, state, True)
            l = loss(output.float(), y)
            l.backward()
            optimizer.step()
            optimizer.zero_grad()
            preds = [1 if i > 0.5 else 0 for i in output]
            for i in range(len(preds)):
                total += 1
                if preds[i] == y[i]:
                    correct += 1
        acc = correct / total
        end = time.time()
        print("epoch", epoch, "time:", end - start, "loss:", l.item(),
              "correct:", correct, "total:", total, "acc:", acc)
        torch.save(model, 'model.pkl')
        evaluate(model, train_data, train_label, batch_size)
        train_ls.append(l.item())
    return train_ls
#model = torch.load('model.pkl')
def evaluate(model, train_data, train_label, batch_size):
    test_list = []
    dataset = torch.utils.data.TensorDataset(train_data, train_label)
    train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=False)
    with torch.no_grad():
        state = None
        index = 0
        for X, y in train_iter:
            index += 1
            if index <= 400:   # the first 400 batches were used for training
                continue
            if index >= 470:   # stop early so the fixed batch size of 16 always holds
                break
            X = X.long()
            output, state = model(X, state, False)
            preds = [1 if i > 0.5 else 0 for i in output]
            test_list.extend(preds)
    correct = 0
    offset = 400 * batch_size   # predictions start at sample 6400, not 0
    for i in range(0, len(test_list)):
        if train_label[offset + i] == test_list[i]:
            correct += 1
    print("eval:", correct / len(test_list))
def out_put(model):
    test_list = []
    max_len = 31
    # Pad or truncate every test text to max_len in place.
    for x in test_data.text:
        while len(x) < max_len:
            x.append(0)
        del x[max_len:]
    test_rows = []   # fresh list: do not reuse the training data_list here
    for i in range(0, len(test_data)):
        test_rows.append(test_data.iloc[i]['text'])
    test_data_tensor = torch.Tensor(test_rows)
    if USE_CUDA:
        print("using GPU")
        test_data_tensor = test_data_tensor.cuda()
    with torch.no_grad():
        result_dataset = torch.utils.data.TensorDataset(test_data_tensor)
        result_dataloader = torch.utils.data.DataLoader(result_dataset, batch_size=16, shuffle=False)
        state = None
        index = 0
        for X in result_dataloader:
            X = X[0]
            if index >= 203:   # only 203 full batches of 16 fit the fixed-size model
                break
            index += 1
            X = X.long()
            output, state = model(X, state, False)
            preds = [1 if i > 0.5 else 0 for i in output]
            test_list.extend(preds)
    print(len(test_list))
    while len(test_list) < 3263:   # fill the rows the fixed batch size could not cover
        test_list.append(0)
    df_output = pd.DataFrame()
    aux = pd.read_csv(base_path + 'test.csv')
    df_output['id'] = aux['id']
    df_output['target'] = test_list
    df_output[['id', 'target']].to_csv(base_path + 's1mple.csv', index=False)
train(model, traindata_tensor, train_label, 30, 0.01, 16)
print("\n")
evaluate(model, traindata_tensor, train_label, 16)
out_put(model)