Data Processing

Raw data format

    {"text": "他们需要1分确保小组出线。出线形势要求赫塔必须全力争胜。interwetten相同赔率下,", "label": {"organization": {"赫塔": [[19, 20]], "interwetten": [[28, 38]]}}}
    {"text": "20雷池,本场无冷迹象。", "label": {"address": {"雷池": [[2, 3]]}}}
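
The index pairs are inclusive character offsets into text (the assert inside _read_json below depends on this). A quick sanity check, as a sketch:

    import json

    line = json.loads('{"text": "20雷池,本场无冷迹象。", "label": {"address": {"雷池": [[2, 3]]}}}')
    start, end = line["label"]["address"]["雷池"][0]
    # The end offset is inclusive, so slice with end + 1
    assert line["text"][start:end + 1] == "雷池"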

Reading the file with _read_json

    def _read_json(self, input_file):
        lines = []
        with open(input_file, 'r', encoding='UTF-8') as f:
            for line in f:
                line = json.loads(line.strip())
                text = line['text']
                label_entities = line.get('label', None)
                words = list(text)
                labels = ['O'] * len(words)
                if label_entities is not None:
                    for key, value in label_entities.items():
                        for sub_name, sub_index in value.items():
                            for start_index, end_index in sub_index:
                                assert ''.join(words[start_index:end_index + 1]) == sub_name
                                if start_index == end_index:
                                    labels[start_index] = 'S-' + key
                                else:
                                    labels[start_index] = 'B-' + key
                                    labels[start_index + 1:end_index + 1] = ['I-' + key] * (len(sub_name) - 1)
                lines.append({"words": words, "labels": labels})
        return lines

The reader's output is BIO- or BIOS-style tagging: one sequence of characters and one parallel sequence of tags per sample.
BIOS is used here.
Simply put, a B- tag marks the first character of an entity and an I- tag marks a character inside it.
An entity span therefore runs from a B- tag until the first tag that is not I- (an O, another B-, or an S-).
The S- tag marks a single-character entity, which is exactly the start_index == end_index branch in _read_json above.
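
For the second sample above, the reader would therefore produce something like this (a sketch of the dict appended to lines):

    {
        "words":  ["2", "0", "雷", "池", ",", "本", "场", "无", "冷", "迹", "象", "。"],
        "labels": ["O", "O", "B-address", "I-address", "O", "O", "O", "O", "O", "O", "O", "O"]
    }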
The parsed lines then go into _create_examples, which in turn calls get_entities.
Because the reader above has already normalized the data, the two functions below can work with the standard dataset format.

    class CluenerProcessor(DataProcessor):
        """Processor for the chinese ner data set."""

        def get_train_examples(self, data_dir):
            """See base class."""
            return self._create_examples(self._read_json(os.path.join(data_dir, "train.json")), "train")

        def get_dev_examples(self, data_dir):
            """See base class."""
            return self._create_examples(self._read_json(os.path.join(data_dir, "dev.json")), "dev")

        def get_test_examples(self, data_dir):
            """See base class."""
            return self._create_examples(self._read_json(os.path.join(data_dir, "test.json")), "test")

        def get_labels(self):
            """See base class."""
            return ["O", "address", "book", "company", 'game', 'government', 'movie', 'name',
                    'organization', 'position', 'scene']

        def _create_examples(self, lines, set_type):
            """Creates examples for the training and dev sets."""
            examples = []
            for (i, line) in enumerate(lines):
                guid = "%s-%s" % (set_type, i)
                text_a = line['words']
                labels = line['labels']
                subject = get_entities(labels, id2label=None, markup='bios')
                examples.append(InputExample(guid=guid, text_a=text_a, subject=subject))
            return examples


    ner_processors = {
        'cluener': CluenerProcessor
    }
    def get_entities(seq, id2label, markup='bios'):
        '''
        :param seq:
        :param id2label:
        :param markup:
        :return:
        '''
        assert markup in ['bio', 'bios']
        if markup == 'bio':
            return get_entity_bio(seq, id2label)
        else:
            return get_entity_bios(seq, id2label)
    def get_entity_bios(seq, id2label):
        """Gets entities from sequence.
        note: BIOS
        Args:
            seq (list): sequence of labels.
        Returns:
            list: list of (chunk_type, chunk_start, chunk_end).
        Example:
            # >>> seq = ['B-PER', 'I-PER', 'O', 'S-LOC']
            # >>> get_entity_bios(seq)
            [['PER', 0, 1], ['LOC', 3, 3]]
        """
        chunks = []
        chunk = [-1, -1, -1]
        for indx, tag in enumerate(seq):
            if not isinstance(tag, str):
                tag = id2label[tag]
            if tag.startswith("S-"):
                if chunk[2] != -1:
                    chunks.append(chunk)
                chunk = [-1, -1, -1]
                chunk[1] = indx
                chunk[2] = indx
                chunk[0] = tag.split('-')[1]
                chunks.append(chunk)
                chunk = (-1, -1, -1)
            if tag.startswith("B-"):
                if chunk[2] != -1:
                    chunks.append(chunk)
                chunk = [-1, -1, -1]
                chunk[1] = indx
                chunk[0] = tag.split('-')[1]
            elif tag.startswith('I-') and chunk[1] != -1:
                _type = tag.split('-')[1]
                if _type == chunk[0]:
                    chunk[2] = indx
                if indx == len(seq) - 1:
                    chunks.append(chunk)
            else:
                if chunk[2] != -1:
                    chunks.append(chunk)
                chunk = [-1, -1, -1]
        return chunks
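
Tying this back to the reader output: the label sequence from the earlier sketch decodes into a single address span. A quick check, reusing get_entities from above:

    labels = ["O", "O", "B-address", "I-address", "O", "O", "O", "O", "O", "O", "O", "O"]
    print(get_entities(labels, id2label=None, markup='bios'))
    # -> [['address', 2, 3]]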
    def collate_fn(batch):
        """
        batch should be a list of (sequence, target, length) tuples...
        Returns a padded tensor of sequences sorted from longest to shortest,
        """
        all_input_ids, all_input_mask, all_segment_ids, all_start_ids, all_end_ids, all_lens = map(torch.stack, zip(*batch))
        max_len = max(all_lens).item()
        all_input_ids = all_input_ids[:, :max_len]
        all_input_mask = all_input_mask[:, :max_len]
        all_segment_ids = all_segment_ids[:, :max_len]
        all_start_ids = all_start_ids[:, :max_len]
        all_end_ids = all_end_ids[:, :max_len]
        return all_input_ids, all_input_mask, all_segment_ids, all_start_ids, all_end_ids, all_lens
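
So collate_fn just trims every pre-padded tensor in the batch back to the longest real sequence length. A toy shape check, assuming the features were padded to max_seq_length = 50 and reusing collate_fn from above:

    import torch

    item_a = (torch.zeros(50, dtype=torch.long),) * 5 + (torch.tensor(12),)  # real length 12
    item_b = (torch.zeros(50, dtype=torch.long),) * 5 + (torch.tensor(30),)  # real length 30
    out = collate_fn([item_a, item_b])
    print(out[0].shape)  # torch.Size([2, 30]) -- cut down to the longest length in the batch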

The feature-building function used below.
Its comments briefly explain what each step does.

    def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer,
                                     cls_token_at_end=False, cls_token="[CLS]", cls_token_segment_id=1,
                                     sep_token="[SEP]", pad_on_left=False, pad_token=0, pad_token_segment_id=0,
                                     sequence_a_segment_id=0, mask_padding_with_zero=True,):
        """ Loads a data file into a list of `InputBatch`s
            `cls_token_at_end` define the location of the CLS token:
                - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
                - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
            `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
        """
        label2id = {label: i for i, label in enumerate(label_list)}
        features = []
        for (ex_index, example) in enumerate(examples):
            if ex_index % 10000 == 0:
                logger.info("Writing example %d of %d", ex_index, len(examples))
            textlist = example.text_a
            subjects = example.subject
            tokens = tokenizer.tokenize(textlist)
            start_ids = [0] * len(tokens)
            end_ids = [0] * len(tokens)
            subjects_id = []
            for subject in subjects:
                label = subject[0]
                start = subject[1]
                end = subject[2]
                # Subject example: ['organization', 3, 4]
                # first: entity type, second: start position, third: end position
                start_ids[start] = label2id[label]
                end_ids[end] = label2id[label]
                subjects_id.append((label2id[label], start, end))
            # Account for [CLS] and [SEP] with "- 2".
            special_tokens_count = 2
            # Truncate to the maximum sequence length
            if len(tokens) > max_seq_length - special_tokens_count:
                tokens = tokens[: (max_seq_length - special_tokens_count)]
                start_ids = start_ids[: (max_seq_length - special_tokens_count)]
                end_ids = end_ids[: (max_seq_length - special_tokens_count)]
            # The convention in BERT is:
            # (a) For sequence pairs:
            #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
            # (b) For single sequences:
            #  tokens:   [CLS] the dog is hairy . [SEP]
            #  type_ids:   0   0   0   0  0     0   0
            #
            # Where "type_ids" are used to indicate whether this is the first
            # sequence or the second sequence. The embedding vectors for `type=0` and
            # `type=1` were learned during pre-training and are added to the wordpiece
            # embedding vector (and position vector). This is not *strictly* necessary
            # since the [SEP] token unambiguously separates the sequences, but it makes
            # it easier for the model to learn the concept of sequences.
            #
            # For classification tasks, the first vector (corresponding to [CLS]) is
            # used as the "sentence vector". Note that this only makes sense because
            # the entire model is fine-tuned.
            # Append the trailing [SEP]
            tokens += [sep_token]
            start_ids += [0]
            end_ids += [0]
            # What is segment_id actually used for?????
            segment_ids = [sequence_a_segment_id] * len(tokens)
            if cls_token_at_end:
                tokens += [cls_token]
                start_ids += [0]
                end_ids += [0]
                segment_ids += [cls_token_segment_id]
            else:
                tokens = [cls_token] + tokens
                start_ids = [0] + start_ids
                end_ids = [0] + end_ids
                segment_ids = [cls_token_segment_id] + segment_ids
            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
            input_len = len(input_ids)
            # Zero-pad up to the sequence length.
            padding_length = max_seq_length - len(input_ids)
            if pad_on_left:
                input_ids = ([pad_token] * padding_length) + input_ids
                input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
                segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
                start_ids = ([0] * padding_length) + start_ids
                end_ids = ([0] * padding_length) + end_ids
            else:
                input_ids += [pad_token] * padding_length
                input_mask += [0 if mask_padding_with_zero else 1] * padding_length
                segment_ids += [pad_token_segment_id] * padding_length
                start_ids += ([0] * padding_length)
                end_ids += ([0] * padding_length)
            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length
            assert len(start_ids) == max_seq_length
            assert len(end_ids) == max_seq_length
            '''
            if ex_index < 5:
                logger.info("*** Example ***")
                logger.info("guid: %s", example.guid)
                logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
                logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
                logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
                logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
                logger.info("start_ids: %s" % " ".join([str(x) for x in start_ids]))
                logger.info("end_ids: %s" % " ".join([str(x) for x in end_ids]))
            '''
            features.append(InputFeature(input_ids=input_ids,
                                         input_mask=input_mask,
                                         segment_ids=segment_ids,
                                         start_ids=start_ids,
                                         end_ids=end_ids,
                                         subjects=subjects_id,
                                         input_len=input_len))
        return features
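
To make the span labels concrete: with get_labels() above, label2id['organization'] is 8, so the subject ['organization', 3, 4] from the comment becomes start_ids[3] = 8 and end_ids[4] = 8; prepending [CLS] later shifts every marker one position to the right. A small sketch with a made-up 6-token sentence:

    label_list = ["O", "address", "book", "company", "game", "government", "movie", "name",
                  "organization", "position", "scene"]
    label2id = {label: i for i, label in enumerate(label_list)}

    subjects = [["organization", 3, 4]]   # the example from the comment above
    start_ids = [0] * 6                   # toy sentence of 6 tokens
    end_ids = [0] * 6
    for label, start, end in subjects:
        start_ids[start] = label2id[label]
        end_ids[end] = label2id[label]
    # start_ids == [0, 0, 0, 8, 0, 0]
    # end_ids   == [0, 0, 0, 0, 8, 0]
    # After [CLS] is prepended and [SEP] appended, a 0 is added at each end,
    # so the entity markers move one position to the right.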

At first I did not see what segment_ids are for; they look just like another per-token input.
They mark whether a token belongs to the first or the second sentence (the BERT comment quoted at the end of this note explains it).
The training snippet further down also shows that XLM and RoBERTa don't use them and DistilBERT doesn't accept them at all, while BERT and XLNet do.

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_start_ids = torch.tensor([f.start_ids for f in features], dtype=torch.long)
    all_end_ids = torch.tensor([f.end_ids for f in features], dtype=torch.long)
    all_input_lens = torch.tensor([f.input_len for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_ids, all_end_ids, all_input_lens)

These tensors are then fed to the model:

    inputs = {"input_ids": batch[0], "attention_mask": batch[1],
              "start_positions": batch[3], "end_positions": batch[4]}
    if args.model_type != "distilbert":
        # XLM and RoBERTa don't use segment_ids
        inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)

Model construction

    class BertSpanForNer(BertPreTrainedModel):
        def __init__(self, config,):
            super(BertSpanForNer, self).__init__(config)
            self.soft_label = config.soft_label
            self.num_labels = config.num_labels
            self.loss_type = config.loss_type
            self.bert = BertModel(config)
            self.dropout = nn.Dropout(config.hidden_dropout_prob)
            self.start_fc = PoolerStartLogits(config.hidden_size, self.num_labels)
            if self.soft_label:
                self.end_fc = PoolerEndLogits(config.hidden_size + self.num_labels, self.num_labels)
            else:
                self.end_fc = PoolerEndLogits(config.hidden_size + 1, self.num_labels)
            self.init_weights()

        def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None):
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            sequence_output = outputs[0]
            sequence_output = self.dropout(sequence_output)
            start_logits = self.start_fc(sequence_output)
            if start_positions is not None and self.training:
                if self.soft_label:
                    batch_size = input_ids.size(0)
                    seq_len = input_ids.size(1)
                    label_logits = torch.FloatTensor(batch_size, seq_len, self.num_labels)
                    label_logits.zero_()
                    label_logits = label_logits.to(input_ids.device)
                    label_logits.scatter_(2, start_positions.unsqueeze(2), 1)
                else:
                    label_logits = start_positions.unsqueeze(2).float()
            else:
                label_logits = F.softmax(start_logits, -1)
                if not self.soft_label:
                    label_logits = torch.argmax(label_logits, -1).unsqueeze(2).float()
            end_logits = self.end_fc(sequence_output, label_logits)
            outputs = (start_logits, end_logits,) + outputs[2:]
            if start_positions is not None and end_positions is not None:
                assert self.loss_type in ['lsr', 'focal', 'ce']
                if self.loss_type == 'lsr':
                    loss_fct = LabelSmoothingCrossEntropy()
                elif self.loss_type == 'focal':
                    loss_fct = FocalLoss()
                else:
                    loss_fct = CrossEntropyLoss()
                start_logits = start_logits.view(-1, self.num_labels)
                end_logits = end_logits.view(-1, self.num_labels)
                active_loss = attention_mask.view(-1) == 1
                active_start_logits = start_logits[active_loss]
                active_end_logits = end_logits[active_loss]
                active_start_labels = start_positions.view(-1)[active_loss]
                active_end_labels = end_positions.view(-1)[active_loss]
                start_loss = loss_fct(active_start_logits, active_start_labels)
                end_loss = loss_fct(active_end_logits, active_end_labels)
                total_loss = (start_loss + end_loss) / 2
                outputs = (total_loss,) + outputs
            return outputs

The BERT output is a tuple.
The first item, outputs[0] with shape torch.Size([8, 50, 768]), appears to be the per-token output for the sequence.
The second item, outputs[1] with shape torch.Size([8, 768]), appears to be the encoding of the whole sentence.
This model uses the first one; when I previously used BERT for sentence classification, the demos seemed to use the second.
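
A minimal sketch of the two items, assuming the older transformers behavior used by this repo where BertModel returns a plain tuple:

    # outputs comes from self.bert(...) in forward()
    sequence_output = outputs[0]   # [batch, seq_len, hidden], e.g. [8, 50, 768]: one vector per token -> used by the span heads
    pooled_output = outputs[1]     # [batch, hidden], e.g. [8, 768]: dense+tanh over the [CLS] vector -> the usual sentence-classification input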

The output is computed in two parts; the first scores the start positions.
start_positions is passed in as an argument during training.
As for the scatter_ call, my rough understanding is that it fills a 1 at each corresponding start position: every position that begins an entity gets a 1 under that entity's label, turning start_positions into a one-hot [batch, seq_len, num_labels] tensor.

    sequence_output = outputs[0]
    sequence_output = self.dropout(sequence_output)
    start_logits = self.start_fc(sequence_output)
    if start_positions is not None and self.training:
        if self.soft_label:
            # print("hit here")
            batch_size = input_ids.size(0)
            seq_len = input_ids.size(1)
            label_logits = torch.FloatTensor(batch_size, seq_len, self.num_labels)
            label_logits.zero_()
            label_logits = label_logits.to(input_ids.device)
            label_logits.scatter_(2, start_positions.unsqueeze(2), 1)
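
A toy check of what that scatter_ call produces, with made-up sizes (1 sentence, 5 tokens, 3 label classes):

    import torch

    start_positions = torch.tensor([[0, 2, 0, 0, 1]])           # label id per token, 0 meaning "O"
    label_logits = torch.zeros(1, 5, 3)
    label_logits.scatter_(2, start_positions.unsqueeze(2), 1)   # one-hot along the label dimension
    # label_logits[0, 1] == tensor([0., 0., 1.])  -> token 1 starts an entity of type 2
    # label_logits[0, 0] == tensor([1., 0., 0.])  -> token 0 starts nothing ("O" class)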

The attention mask from earlier marks the real length of each sentence.
Indexing with it is effectively a truncation that keeps only the real tokens.

    active_loss = attention_mask.view(-1) == 1
    active_start_logits = start_logits[active_loss]
    active_end_logits = end_logits[active_loss]

In the end we have, for all the real tokens in the batch, the start logits and the matching start_ids targets.
For example, if a sentence has 100 real tokens, start_logits holds 100 score vectors (one per position, one score per label), and start_ids holds 100 target ids that are non-zero only at entity start positions, where they carry the entity-type id; these two go into the loss function. The end logits and end_ids are handled the same way.
So the loss inputs are shaped [length, num_labels] and [length]; several loss functions are supported, with cross-entropy as the default.
This was the first time I saw a token-level multi-class loss set up this way.
Finally, the average of the start loss and the end loss is used as the total loss.
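
A shape-only sketch of that loss computation, with made-up sizes (2 sentences, 6 tokens, 11 label classes):

    import torch
    from torch.nn import CrossEntropyLoss

    batch, seq_len, num_labels = 2, 6, 11
    start_logits = torch.randn(batch, seq_len, num_labels)
    start_positions = torch.randint(0, num_labels, (batch, seq_len))
    attention_mask = torch.tensor([[1, 1, 1, 1, 0, 0],
                                   [1, 1, 1, 1, 1, 1]])

    active = attention_mask.view(-1) == 1                      # keep only real tokens
    active_logits = start_logits.view(-1, num_labels)[active]  # [10, 11]
    active_labels = start_positions.view(-1)[active]           # [10]
    start_loss = CrossEntropyLoss()(active_logits, active_labels)
    # end_loss is computed the same way; total_loss = (start_loss + end_loss) / 2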

Training setup

A series of preparation steps.
Probably because the model is hand-built, a lot of things are specified explicitly before training:
the optimizer and per-parameter weight decay,
and something new to me, a warmup schedule.

    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,
                                  collate_fn=collate_fn)
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         "weight_decay": args.weight_decay},
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)
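
A toy illustration of what get_linear_schedule_with_warmup does (made-up numbers, not the project's real settings):

    import torch
    from transformers import AdamW, get_linear_schedule_with_warmup

    params = [torch.nn.Parameter(torch.zeros(1))]
    optimizer = AdamW(params, lr=1e-3)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=10, num_training_steps=100)
    lrs = []
    for step in range(100):
        optimizer.step()
        scheduler.step()
        lrs.append(scheduler.get_last_lr()[0])
    # lrs rises linearly from 0 to 1e-3 over the first 10 steps, then decays linearly back towards 0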

Prediction setup

Finally, at predict time a function is needed to turn the logits back into readable results.
From the earlier computation we know that start_logits and end_logits for the current batch have shape [batch_num, length, labels_num].
At prediction time batch_num is set to 1, i.e. one sentence at a time.
torch.argmax over the labels_num dimension gives, for each position, the index of the highest-scoring label, so the result is a list of length [length]. A position that is not the start (or end) of an entity gets 0, which is one of the preset classes ("O", no type); otherwise it gets the entity-type id. Whenever a non-zero id appears in the start list, the code scans the end list from that position onward for the same id to close the span, and the algorithm ends there.

    def bert_extract_item(start_logits, end_logits):
        S = []
        start_pred = torch.argmax(start_logits, -1).cpu().numpy()[0][1:-1]
        end_pred = torch.argmax(end_logits, -1).cpu().numpy()[0][1:-1]
        for i, s_l in enumerate(start_pred):
            if s_l == 0:
                continue
            for j, e_l in enumerate(end_pred[i:]):
                if s_l == e_l:
                    S.append((s_l, i, i + j))
                    break
        return S
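
A usage sketch; processor, text, start_logits and end_logits are assumed to come from the surrounding predict code (hypothetical names here), and the [0][1:-1] above already strips the [CLS]/[SEP] positions so the indices line up with the raw characters:

    subjects = bert_extract_item(start_logits, end_logits)     # e.g. [(8, 3, 4)]
    id2label = {i: label for i, label in enumerate(processor.get_labels())}
    for label_id, start, end in subjects:
        # with the label list above, id 8 maps to 'organization'
        print(id2label[label_id], "".join(text[start:end + 1]))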

What that comment block says is that type_ids indicate whether a token belongs to the first or the second sentence:
0 for the first sentence, 1 for the second.
The embedding vectors for 0/1 were learned during pre-training and are added to the wordpiece and position embeddings.
Since [SEP] already separates the two sentences unambiguously this is not strictly necessary, but it makes it easier for the model to learn the notion of two sequences.
For classification tasks, the first vector (the one for [CLS]) is used as the sentence vector.
This only makes sense because the whole model is fine-tuned.

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids:   0   0   0   0  0     0   0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.