Hacking BERT

Reading the source code

For now I treat BERT as a black-box tool: I am not yet concerned with its internal details, only with what goes in and what comes out.

    # BertForSequenceClassification, excerpted from transformers' modeling_bert
    # (see the source links at the end of these notes).
    import torch.nn as nn
    from torch.nn import CrossEntropyLoss, MSELoss


    class BertForSequenceClassification(BertPreTrainedModel):

        def __init__(self, config):
            super(BertForSequenceClassification, self).__init__(config)
            self.num_labels = config.num_labels

            self.bert = BertModel(config)
            self.dropout = nn.Dropout(config.hidden_dropout_prob)
            self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

            self.init_weights()

        def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                    position_ids=None, head_mask=None, inputs_embeds=None, labels=None):

            outputs = self.bert(input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids,
                                position_ids=position_ids,
                                head_mask=head_mask,
                                inputs_embeds=inputs_embeds)

            pooled_output = outputs[1]

            pooled_output = self.dropout(pooled_output)
            logits = self.classifier(pooled_output)

            outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

            if labels is not None:
                if self.num_labels == 1:
                    # We are doing regression
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), labels.view(-1))
                else:
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
                outputs = (loss,) + outputs

            return outputs  # (loss), logits, (hidden_states), (attentions)

It is easy to see that this is essentially just BERT's output: the pooled output goes through a dropout layer and then a single linear layer that maps the hidden size onto the labels.

loss: (optional, returned when labels is provided) torch.FloatTensor of shape (1,): Classification (or regression if config.num_labels==1) loss.

logits: torch.FloatTensor of shape (batch_size, config.num_labels) Classification (or regression if config.num_labels==1) scores (before SoftMax).
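Because the head is so thin, "hacking" it mostly means wrapping a plain BertModel and swapping in your own head. Below is a minimal sketch that reproduces the same dropout + linear head as a standalone module, so it can be modified freely; the class name and defaults are my own, and it assumes the tuple-style outputs shown above.

    import torch.nn as nn
    from transformers import BertModel


    class MyBertClassifier(nn.Module):
        """Reimplementation of the dropout + linear head, as a starting point for modifications."""

        def __init__(self, pretrained_name='bert-base-chinese', num_labels=2):
            super().__init__()
            self.bert = BertModel.from_pretrained(pretrained_name)
            self.dropout = nn.Dropout(self.bert.config.hidden_dropout_prob)
            self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

        def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
            outputs = self.bert(input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids)
            pooled_output = outputs[1]                    # [CLS] vector after BertPooler
            logits = self.classifier(self.dropout(pooled_output))
            if labels is not None:
                loss = nn.CrossEntropyLoss()(logits.view(-1, logits.size(-1)), labels.view(-1))
                return loss, logits
            return logits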


The original model

    class BertModel(BertPreTrainedModel):
        """
        The model can behave as an encoder (with only self-attention) as well
        as a decoder, in which case a layer of cross-attention is added between
        the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani,
        Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
        To behave as an decoder the model needs to be initialized with the
        :obj:`is_decoder` argument of the configuration set to :obj:`True`; an
        :obj:`encoder_hidden_states` is expected as an input to the forward pass.

        .. _`Attention is all you need`:
            https://arxiv.org/abs/1706.03762
        """

        def __init__(self, config):
            super().__init__(config)
            self.config = config

            self.embeddings = BertEmbeddings(config)
            self.encoder = BertEncoder(config)
            self.pooler = BertPooler(config)

            self.init_weights()

        def get_input_embeddings(self):
            return self.embeddings.word_embeddings

        def set_input_embeddings(self, value):
            self.embeddings.word_embeddings = value

        def _prune_heads(self, heads_to_prune):
            """ Prunes heads of the model.
                heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
                See base class PreTrainedModel
            """
            for layer, heads in heads_to_prune.items():
                self.encoder.layer[layer].attention.prune_heads(heads)

        @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
        def forward(
            self,
            input_ids=None,
            attention_mask=None,
            token_type_ids=None,
            position_ids=None,
            head_mask=None,
            inputs_embeds=None,
            encoder_hidden_states=None,
            encoder_attention_mask=None,
        ):
            r"""
            Return:
                :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
                last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
                    Sequence of hidden-states at the output of the last layer of the model.
                pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
                    Last layer hidden-state of the first token of the sequence (classification token)
                    further processed by a Linear layer and a Tanh activation function. The Linear
                    layer weights are trained from the next sentence prediction (classification)
                    objective during pre-training.
                    This output is usually *not* a good summary
                    of the semantic content of the input, you're often better with averaging or pooling
                    the sequence of hidden-states for the whole input sequence.
                hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
                    Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
                    of shape :obj:`(batch_size, sequence_length, hidden_size)`.
                    Hidden-states of the model at the output of each layer plus the initial embedding outputs.
                attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
                    Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
                    :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
                    Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
                    heads.

            Examples::

                from transformers import BertModel, BertTokenizer
                import torch

                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
                model = BertModel.from_pretrained('bert-base-uncased')

                input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
                outputs = model(input_ids)

                last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
            """

            if input_ids is not None and inputs_embeds is not None:
                raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
            elif input_ids is not None:
                input_shape = input_ids.size()
            elif inputs_embeds is not None:
                input_shape = inputs_embeds.size()[:-1]
            else:
                raise ValueError("You have to specify either input_ids or inputs_embeds")

            device = input_ids.device if input_ids is not None else inputs_embeds.device

            if attention_mask is None:
                attention_mask = torch.ones(input_shape, device=device)
            if token_type_ids is None:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

            # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
            # ourselves in which case we just need to make it broadcastable to all heads.
            if attention_mask.dim() == 3:
                extended_attention_mask = attention_mask[:, None, :, :]
            elif attention_mask.dim() == 2:
                # Provided a padding mask of dimensions [batch_size, seq_length]
                # - if the model is a decoder, apply a causal mask in addition to the padding mask
                # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
                if self.config.is_decoder:
                    batch_size, seq_length = input_shape
                    seq_ids = torch.arange(seq_length, device=device)
                    causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
                    causal_mask = causal_mask.to(
                        attention_mask.dtype
                    )  # causal and attention masks must have same type with pytorch version < 1.3
                    extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
                else:
                    extended_attention_mask = attention_mask[:, None, None, :]
            else:
                raise ValueError(
                    "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
                        input_shape, attention_mask.shape
                    )
                )

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
            extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

            # If a 2D ou 3D attention mask is provided for the cross-attention
            # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
            if self.config.is_decoder and encoder_hidden_states is not None:
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
                encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
                if encoder_attention_mask is None:
                    encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)

                if encoder_attention_mask.dim() == 3:
                    encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
                elif encoder_attention_mask.dim() == 2:
                    encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
                else:
                    raise ValueError(
                        "Wrong shape for encoder_hidden_shape (shape {}) or encoder_attention_mask (shape {})".format(
                            encoder_hidden_shape, encoder_attention_mask.shape
                        )
                    )

                encoder_extended_attention_mask = encoder_extended_attention_mask.to(
                    dtype=next(self.parameters()).dtype
                )  # fp16 compatibility
                encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
            else:
                encoder_extended_attention_mask = None

            # Prepare head mask if needed
            # 1.0 in head_mask indicate we keep the head
            # attention_probs has shape bsz x n_heads x N x N
            # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
            # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
            if head_mask is not None:
                if head_mask.dim() == 1:
                    head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                    head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
                elif head_mask.dim() == 2:
                    head_mask = (
                        head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
                    )  # We can specify head_mask for each layer
                head_mask = head_mask.to(
                    dtype=next(self.parameters()).dtype
                )  # switch to fload if need + fp16 compatibility
            else:
                head_mask = [None] * self.config.num_hidden_layers

            embedding_output = self.embeddings(
                input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
            )
            encoder_outputs = self.encoder(
                embedding_output,
                attention_mask=extended_attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_extended_attention_mask,
            )
            sequence_output = encoder_outputs[0]
            pooled_output = self.pooler(sequence_output)

            outputs = (sequence_output, pooled_output,) + encoder_outputs[
                1:
            ]  # add hidden_states and attentions if they are here
            return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
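The hidden_states and attentions mentioned in this docstring (and picked up by outputs[2:] in BertForSequenceClassification) are only returned when the corresponding config flags are on. A small sketch of how to enable them; this follows the config API of the transformers version documented above, so treat it as an assumption if your version differs:

    from transformers import BertConfig, BertModel

    # Ask the model to also return all layer hidden states and attention maps.
    config = BertConfig.from_pretrained('bert-base-chinese',
                                        output_hidden_states=True,
                                        output_attentions=True)
    model = BertModel.from_pretrained('bert-base-chinese', config=config)
    # forward() now returns: sequence_output, pooled_output, hidden_states, attentions

Next, a quick sanity check of the plain model on a two-character Chinese input: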
    from transformers import BertModel, BertTokenizer
    import torch

    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    model = BertModel.from_pretrained('bert-base-chinese')

    input_ids = torch.tensor(tokenizer.encode("美国", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    print(input_ids)
    print(input_ids.shape)

    outputs = model(input_ids)
    # print(outputs)
    last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    print(last_hidden_states.shape)

    others = outputs[1]  # pooler_output: the [CLS] hidden state passed through a Linear + Tanh
    print(others.shape)
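With add_special_tokens=True the input becomes [CLS] 美 国 [SEP], so for bert-base-chinese this should print input_ids of shape (1, 4), last_hidden_states of shape (1, 4, 768), and a pooled output of shape (1, 768).

As the docstring warns, the pooled output is often a poor sentence summary; averaging the token states usually works better. A minimal sketch of masked mean pooling (the variable names here are my own):

    import torch
    from transformers import BertModel, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    model = BertModel.from_pretrained('bert-base-chinese')

    input_ids = torch.tensor(tokenizer.encode("美国", add_special_tokens=True)).unsqueeze(0)
    attention_mask = torch.ones_like(input_ids)       # no padding in this toy batch

    outputs = model(input_ids, attention_mask=attention_mask)
    last_hidden = outputs[0]                          # (batch, seq_len, hidden)

    # Masked mean over the token dimension, so padded positions (if any) are ignored.
    mask = attention_mask.unsqueeze(-1).float()       # (batch, seq_len, 1)
    sentence_vec = (last_hidden * mask).sum(dim=1) / mask.sum(dim=1)
    print(sentence_vec.shape)                         # torch.Size([1, 768])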


Code sources

https://github.com/huggingface/transformers/tree/master/examples/ner
A BERT NER example implemented in PyTorch
https://huggingface.co/transformers/_modules/transformers/modeling_bert.html#BertForSequenceClassification