References and key points:

Data sample

```
B-LOC
E-LOC
O
B-PER
I-PER
E-PER
O
O
O
O
O
O
O
```
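Only the tag column of the sample is shown above. Judging from `_parse_data` / `_process_data` below, each line of `demo.train.char` / `demo.test.char` holds one character and its tag separated by a space, and sentences are separated by a blank line. A hypothetical fragment in that format (the characters are illustrative, taken from the prediction sentence later in this post, not from the original sample):

```
周 B-PER
恩 I-PER
来 E-PER
访 O
问 O
非 B-LOC
洲 E-LOC
```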

Code

Data processing

```python
import numpy
from collections import Counter
from keras.preprocessing.sequence import pad_sequences
import pickle
import platform
import sys

sys.path.append('/tf/keras/keras-contrib')


def _parse_data(fh):
    # On Windows the blank line between samples is '\r\n\r\n' and the line break is '\r\n';
    # on Linux they are '\n\n' and '\n', so pick the separator per platform.
    if platform.system() == 'Windows':
        split_text = '\r\n'
    else:
        split_text = '\n'
    raw_corpus = fh.read().decode('utf-8')
    data = [[row.split() for row in sample.split(split_text)]
            for sample in raw_corpus.strip().split(split_text + split_text)]
    fh.close()
    return data


def _process_data(data, vocab, chunk_tags, maxlen=None, onehot=False):
    if maxlen is None:
        maxlen = max(len(s) for s in data)
    word2idx = {w: i for i, w in enumerate(vocab)}
    # words not in the vocabulary are mapped to <unk> (index 1)
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]
    y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]
    x = pad_sequences(x, maxlen)  # left padding
    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)
    if onehot:
        y_chunk = numpy.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        y_chunk = numpy.expand_dims(y_chunk, 2)
    return x, y_chunk


def process_data(data, vocab, maxlen=100):
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    x = [word2idx.get(w[0].lower(), 1) for w in data]
    length = len(x)
    x = pad_sequences([x], maxlen)  # left padding
    return x, length


def load_data():
    train = _parse_data(open('demo.train.char', 'rb'))
    test = _parse_data(open('demo.test.char', 'rb'))
    word_counts = Counter(row[0].lower() for sample in train for row in sample)
    # keep only characters that occur at least twice
    vocab = [w for w, f in iter(word_counts.items()) if f >= 2]
    chunk_tags = list(set(line[1] for sample in train for line in sample))
    # save the initial config data
    with open('config.pkl', 'wb') as outp:
        pickle.dump((vocab, chunk_tags), outp)
    train = _process_data(train, vocab, chunk_tags)
    test = _process_data(test, vocab, chunk_tags)
    return train, test, (vocab, chunk_tags)
```
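A minimal usage sketch (not in the original post; it assumes the code above is saved as `process_data.py`): `load_data()` returns the padded index matrices for the train/test splits plus the vocabulary and tag list that were pickled into `config.pkl`.

```python
from process_data import load_data  # hypothetical module name for the code above

(train_x, train_y), (test_x, test_y), (vocab, chunk_tags) = load_data()
print(train_x.shape, train_y.shape)  # (num_sentences, maxlen) and (num_sentences, maxlen, 1)
print(len(vocab), len(chunk_tags))
```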

Building the model

```python
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM
from keras_contrib.layers import CRF
import pickle

# load_data() comes from the data-processing section above
# (assumption: it is saved as process_data.py next to this script)
from process_data import load_data

EMBED_DIM = 200
BiRNN_UNITS = 200


def create_model(train=True):
    if train:
        (train_x, train_y), (test_x, test_y), (vocab, chunk_tags) = load_data()
    else:
        # same path that load_data() writes to
        with open('config.pkl', 'rb') as inp:
            (vocab, chunk_tags) = pickle.load(inp)
    model = Sequential()
    model.add(Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # random embedding
    model.add(Bidirectional(LSTM(BiRNN_UNITS // 2, return_sequences=True)))
    crf = CRF(len(chunk_tags), sparse_target=True)
    model.add(crf)
    model.summary()
    model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
    if train:
        return model, (train_x, train_y), (test_x, test_y)
    else:
        return model, (vocab, chunk_tags)


if __name__ == "__main__":
    EPOCHS = 10
    model, (train_x, train_y), (test_x, test_y) = create_model()
    # train the model
    model.fit(train_x, train_y, batch_size=16, epochs=EPOCHS,
              validation_data=(test_x, test_y))
    # keep the weight file in the working directory, consistent with the prediction section
    model.save('crf.h5')
```
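A small follow-up sketch (an assumption, not part of the original script): run in the same session right after `model.fit` above, the held-out test split used as validation can also be scored explicitly.

```python
# evaluate CRF loss and accuracy on the test split
loss, acc = model.evaluate(test_x, test_y, batch_size=16)
print('test loss = %.4f, test accuracy = %.4f' % (loss, acc))
```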

Prediction

```python
import numpy as np

# create_model() and process_data() come from the two sections above
# (assumption: they are saved as bilstm_crf_model.py and process_data.py)
from bilstm_crf_model import create_model
from process_data import process_data

# rebuild the network; create_model(train=False) also reloads (vocab, chunk_tags) from config.pkl
model, (vocab, chunk_tags) = create_model(train=False)

predict_text = '中华人民共和国国务院总理周恩来在外交部长陈毅的陪同下,连续访问了埃塞俄比亚等非洲10国以及阿尔巴尼亚'
sequence, length = process_data(predict_text, vocab)
model.load_weights('crf.h5')
raw = model.predict(sequence)[0][-length:]
result = [np.argmax(row) for row in raw]
result_tags = [chunk_tags[i] for i in result]

per, loc, org = '', '', ''
for s, t in zip(predict_text, result_tags):
    if t in ('B-PER', 'I-PER', 'M-PER', 'E-PER', 'S-PER'):
        per += ' ' + s if t == 'B-PER' else s
    if t in ('B-ORG', 'I-ORG', 'M-ORG', 'E-ORG', 'S-ORG'):
        org += ' ' + s if t == 'B-ORG' else s
    if t in ('B-LOC', 'I-LOC', 'M-LOC', 'E-LOC', 'S-LOC'):
        loc += ' ' + s if t == 'B-LOC' else s
print(['person:' + per, 'location:' + loc, 'organization:' + org])
```
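Because a space is prepended at every B- tag, each accumulated string contains the recognised entities separated by spaces. A hedged follow-up (not in the original post) that splits them into lists:

```python
print('persons:', per.split())
print('locations:', loc.split())
print('organizations:', org.split())
```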

Error handling

TypeError: Tensors in list passed to 'values' of 'ConcatV2' Op have types [bool, float32] that don't all match.

Fix: delete mask_zero=True from the Embedding layer.

mask_zero: whether the input value 0 is a special "padding" value that should be masked out. This is useful for variable-length recurrent layers. If set to True, all subsequent layers in the model must support masking, otherwise an exception is raised. As a consequence, when mask_zero is True, index 0 cannot be used for a real word in the vocabulary (input_dim should equal the vocabulary size + 1).
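Concretely, a minimal sketch of the fix described above (EMBED_DIM and vocab are the names from the model-building section):

```python
# Embedding layer rebuilt without mask_zero, to avoid the bool/float32 ConcatV2 type mismatch
model.add(Embedding(len(vocab), EMBED_DIM))
```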