Problem

Part-of-speech tagging (a small demo).

Basic approach

  1. Encode each word as an index
  2. Turn each sentence into a sequence of indices
  3. Feed the sequence into an LSTM
  4. Feed the LSTM output into a Linear layer
  5. Apply log_softmax
  6. Take the argmax as the predicted tag and compute the loss on the log-probabilities (see the sketch after this list)
  7. loss_function = nn.NLLLoss()
  8. optimizer = optim.SGD(model.parameters(), lr=0.1)
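
To make steps 5–7 concrete, here is a minimal sketch on toy tensors: log_softmax turns raw tag scores into log-probabilities, NLLLoss compares them against the gold tags, and the argmax gives the predicted tag. The scores and target indices below are made up purely for illustration.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(1)

# Toy raw scores for a 4-word sentence over 3 tags (DET, NN, V).
tag_space = torch.randn(4, 3)              # (sequence_length, tagset_size)
log_probs = F.log_softmax(tag_space, dim=1)

targets = torch.LongTensor([1, 2, 0, 1])   # made-up gold tag indices

# NLLLoss expects log-probabilities plus class indices.
loss = nn.NLLLoss()(log_probs, targets)
print(loss)

# At prediction time, the argmax over the scores is the predicted tag.
pred = torch.max(log_probs, 1)[1]
print(pred)
```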

Code

See another note in this knowledge base for the underlying theory.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

'''
# Quick shape check: bidirectional vs. unidirectional LSTM.
net = nn.LSTM(3, 4, bidirectional=True, batch_first=True)   # hidden size 4
net2 = nn.LSTM(3, 4, bidirectional=False, batch_first=True)
x = torch.rand(1, 5, 3)  # sequence length 5, input size 3
y = net(x)
y2 = net2(x)
print(len(y))        # 2: each result is a tuple (output, (h_n, c_n))
print(len(y2))
print(y[0].shape)    # (1, 5, 8): bidirectional doubles the feature size
print(y2[0].shape)   # (1, 5, 4)
'''

training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]

word_to_ix = {}  # word-to-index dictionary
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)

tag_to_ix = {"DET": 0, "NN": 1, "V": 2}  # hand-built tag-to-index dictionary
index_to_tag = {0: "DET", 1: "NN", 2: "V"}


class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Shape: (num_layers * num_directions, batch, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores


def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.LongTensor(idxs)


model = LSTMTagger(300, 100, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Scores before training, for comparison
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)
print(training_data[0][0])
print(inputs)
print(tag_scores)

for epoch in range(300):
    for sentence, tags in training_data:
        model.zero_grad()
        model.hidden = model.init_hidden()  # reset state between sentences
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)
        tag_scores = model(sentence_in)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# Check the trained model on the first sentence
model.hidden = model.init_hidden()
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)
index = torch.max(tag_scores, 1)[1].tolist()
result = [index_to_tag[i] for i in index]
print(result)
```
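
As a convenience, the inference lines at the end can be wrapped into a small helper; a minimal sketch reusing `model`, `prepare_sequence`, `word_to_ix`, and `index_to_tag` from above (the `predict` name is my own, not part of the original demo):

```python
def predict(sentence):
    # Reset the recurrent state, then tag one sentence of known words.
    model.hidden = model.init_hidden()
    with torch.no_grad():
        inputs = prepare_sequence(sentence, word_to_ix)
        tag_scores = model(inputs)
    index = torch.max(tag_scores, 1)[1].tolist()
    return [index_to_tag[i] for i in index]

# Tag the second training sentence as well.
print(predict(training_data[1][0]))
```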

Results

Predicting on the first training sentence yields a fully correct tag sequence.
Of course, with this little data, evaluating on the training data itself is not rigorous, but it is fine as a demo.