BERT

Reference code: bert/modeling.py

```python
# Excerpted from bert/modeling.py (TensorFlow 1.x). Relies on helpers defined
# elsewhere in the same file: get_shape_list, reshape_to_matrix,
# create_initializer, dropout.
import math

import tensorflow as tf


def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
  """Performs multi-headed attention from `from_tensor` to `to_tensor`.

  This is an implementation of multi-headed attention based on "Attention
  is all you Need". If `from_tensor` and `to_tensor` are the same, then
  this is self-attention. Each timestep in `from_tensor` attends to the
  corresponding sequence in `to_tensor`, and returns a fixed-width vector.

  This function first projects `from_tensor` into a "query" tensor and
  `to_tensor` into "key" and "value" tensors. These are (effectively) a list
  of tensors of length `num_attention_heads`, where each tensor is of shape
  [batch_size, seq_length, size_per_head].

  Then, the query and key tensors are dot-producted and scaled. These are
  softmaxed to obtain attention probabilities. The value tensors are then
  interpolated by these probabilities, then concatenated back to a single
  tensor and returned.

  In practice, the multi-headed attention is done with transposes and
  reshapes rather than actual separate tensors.

  Args:
    from_tensor: float Tensor of shape [batch_size, from_seq_length,
      from_width].
    to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
    attention_mask: (optional) int32 Tensor of shape [batch_size,
      from_seq_length, to_seq_length]. The values should be 1 or 0. The
      attention scores will effectively be set to -infinity for any positions
      in the mask that are 0, and will be unchanged for positions that are 1.
    num_attention_heads: int. Number of attention heads.
    size_per_head: int. Size of each attention head.
    query_act: (optional) Activation function for the query transform.
    key_act: (optional) Activation function for the key transform.
    value_act: (optional) Activation function for the value transform.
    attention_probs_dropout_prob: (optional) float. Dropout probability of the
      attention probabilities.
    initializer_range: float. Range of the weight initializer.
    do_return_2d_tensor: bool. If True, the output will be of shape
      [batch_size * from_seq_length, num_attention_heads * size_per_head].
      If False, the output will be of shape [batch_size, from_seq_length,
      num_attention_heads * size_per_head].
    batch_size: (Optional) int. If the input is 2D, this might be the batch
      size of the 3D version of the `from_tensor` and `to_tensor`.
    from_seq_length: (Optional) If the input is 2D, this might be the seq
      length of the 3D version of the `from_tensor`.
    to_seq_length: (Optional) If the input is 2D, this might be the seq
      length of the 3D version of the `to_tensor`.

  Returns:
    float Tensor of shape [batch_size, from_seq_length,
      num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
      true, this will be of shape [batch_size * from_seq_length,
      num_attention_heads * size_per_head]).

  Raises:
    ValueError: Any of the arguments or tensor shapes are invalid.
  """

  def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                           seq_length, width):
    output_tensor = tf.reshape(
        input_tensor, [batch_size, seq_length, num_attention_heads, width])
    output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
    return output_tensor

  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

  if len(from_shape) != len(to_shape):
    raise ValueError(
        "The rank of `from_tensor` must match the rank of `to_tensor`.")

  if len(from_shape) == 3:
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]
    to_seq_length = to_shape[1]
  elif len(from_shape) == 2:
    if (batch_size is None or from_seq_length is None or
        to_seq_length is None):
      raise ValueError(
          "When passing in rank 2 tensors to attention_layer, the values "
          "for `batch_size`, `from_seq_length`, and `to_seq_length` "
          "must all be specified.")

  # Scalar dimensions referenced here:
  #   B = batch size (number of sequences)
  #   F = `from_tensor` sequence length
  #   T = `to_tensor` sequence length
  #   N = `num_attention_heads`
  #   H = `size_per_head`

  from_tensor_2d = reshape_to_matrix(from_tensor)  # [B*F, from_width]
  to_tensor_2d = reshape_to_matrix(to_tensor)      # [B*T, to_width]

  # `query_layer` = [B*F, N*H]
  query_layer = tf.layers.dense(
      from_tensor_2d,
      num_attention_heads * size_per_head,
      activation=query_act,
      name="query",
      kernel_initializer=create_initializer(initializer_range))

  # `key_layer` = [B*T, N*H]
  key_layer = tf.layers.dense(
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=key_act,
      name="key",
      kernel_initializer=create_initializer(initializer_range))

  # `value_layer` = [B*T, N*H]
  value_layer = tf.layers.dense(
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=value_act,
      name="value",
      kernel_initializer=create_initializer(initializer_range))

  # `query_layer` = [B, N, F, H]
  query_layer = transpose_for_scores(query_layer, batch_size,
                                     num_attention_heads, from_seq_length,
                                     size_per_head)

  # `key_layer` = [B, N, T, H]
  key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                   to_seq_length, size_per_head)

  # Take the dot product between "query" and "key" to get the raw
  # attention scores.
  # `attention_scores` = [B, N, F, T]
  attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
  attention_scores = tf.multiply(attention_scores,
                                 1.0 / math.sqrt(float(size_per_head)))

  if attention_mask is not None:
    # `attention_mask` = [B, 1, F, T]
    attention_mask = tf.expand_dims(attention_mask, axis=[1])

    # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
    # masked positions, this operation will create a tensor which is 0.0 for
    # positions we want to attend and -10000.0 for masked positions.
    adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

    # Since we are adding it to the raw scores before the softmax, this is
    # effectively the same as removing these entirely.
    attention_scores += adder

  # Normalize the attention scores to probabilities.
  # `attention_probs` = [B, N, F, T]
  attention_probs = tf.nn.softmax(attention_scores)

  # This is actually dropping out entire tokens to attend to, which might
  # seem a bit unusual, but is taken from the original Transformer paper.
  attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

  # `value_layer` = [B, T, N, H]
  value_layer = tf.reshape(
      value_layer,
      [batch_size, to_seq_length, num_attention_heads, size_per_head])

  # `value_layer` = [B, N, T, H]
  value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

  # `context_layer` = [B, N, F, H]
  context_layer = tf.matmul(attention_probs, value_layer)

  # `context_layer` = [B, F, N, H]
  context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

  if do_return_2d_tensor:
    # `context_layer` = [B*F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size * from_seq_length, num_attention_heads * size_per_head])
  else:
    # `context_layer` = [B, F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size, from_seq_length, num_attention_heads * size_per_head])

  return context_layer
```
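
For orientation, below is a minimal self-attention call of this function. The concrete shapes, the placeholder wiring, and the `import modeling` line are assumptions made for illustration, not part of modeling.py; inside BERT the actual caller is `transformer_model`, which passes `do_return_2d_tensor=True`.

```python
# Hypothetical usage sketch (TF 1.x graph mode); assumes bert/modeling.py is
# importable so that attention_layer and its helpers are available.
import tensorflow as tf
import modeling  # bert/modeling.py

batch_size, seq_length, hidden_size = 2, 128, 768
num_heads = 12
head_size = hidden_size // num_heads  # 64

# Toy [B, F, hidden] input standing in for the output of the embedding layer.
layer_input = tf.placeholder(tf.float32, [batch_size, seq_length, hidden_size])
# [B, F, T] mask of 1s/0s; in BERT this is built by
# modeling.create_attention_mask_from_input_mask.
attention_mask = tf.placeholder(tf.int32, [batch_size, seq_length, seq_length])

# Self-attention: from_tensor and to_tensor are the same tensor.
context = modeling.attention_layer(
    from_tensor=layer_input,
    to_tensor=layer_input,
    attention_mask=attention_mask,
    num_attention_heads=num_heads,
    size_per_head=head_size,
    do_return_2d_tensor=True,  # -> [B*F, N*H]
    batch_size=batch_size,
    from_seq_length=seq_length,
    to_seq_length=seq_length)

print(context.shape)  # (256, 768) == (batch_size * seq_length, num_heads * head_size)
```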

Attention Is All You Need

Reference code: attention-is-all-you-need-keras/transformer.py

```python
# Excerpted from attention-is-all-you-need-keras/transformer.py
# (Keras 2.x with a TF 1.x backend). The original file uses broad
# `from keras.layers import *`; only the imports needed here are listed.
import tensorflow as tf
from keras import backend as K
from keras.layers import (Activation, Add, Concatenate, Dense, Dropout,
                          Lambda, TimeDistributed)


# It's safe to use a 1-d mask for self-attention.
class ScaledDotProductAttention():
    def __init__(self, attn_dropout=0.1):
        self.dropout = Dropout(attn_dropout)

    def __call__(self, q, k, v, mask):  # mask_k or mask_qk
        temper = tf.sqrt(tf.cast(tf.shape(k)[-1], dtype='float32'))
        # Raw scaled scores: shape = (batch, len_q, len_k)
        attn = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=[2, 2]) / temper)([q, k])
        if mask is not None:
            # Additive mask: 0 where attending, -1e9 where masked.
            mmask = Lambda(lambda x: (-1e+9) * (1. - K.cast(x, 'float32')))(mask)
            attn = Add()([attn, mmask])
        attn = Activation('softmax')(attn)
        attn = self.dropout(attn)
        output = Lambda(lambda x: K.batch_dot(x[0], x[1]))([attn, v])
        return output, attn


class MultiHeadAttention():
    # mode 0 - big matrices, faster; mode 1 - clearer implementation
    def __init__(self, n_head, d_model, dropout, mode=0):
        self.mode = mode
        self.n_head = n_head
        self.d_k = self.d_v = d_k = d_v = d_model // n_head
        self.dropout = dropout
        if mode == 0:
            self.qs_layer = Dense(n_head * d_k, use_bias=False)
            self.ks_layer = Dense(n_head * d_k, use_bias=False)
            self.vs_layer = Dense(n_head * d_v, use_bias=False)
        elif mode == 1:
            self.qs_layers = []
            self.ks_layers = []
            self.vs_layers = []
            for _ in range(n_head):
                self.qs_layers.append(TimeDistributed(Dense(d_k, use_bias=False)))
                self.ks_layers.append(TimeDistributed(Dense(d_k, use_bias=False)))
                self.vs_layers.append(TimeDistributed(Dense(d_v, use_bias=False)))
        self.attention = ScaledDotProductAttention()
        self.w_o = TimeDistributed(Dense(d_model))

    def __call__(self, q, k, v, mask=None):
        d_k, d_v = self.d_k, self.d_v
        n_head = self.n_head

        if self.mode == 0:
            qs = self.qs_layer(q)  # [batch_size, len_q, n_head*d_k]
            ks = self.ks_layer(k)
            vs = self.vs_layer(v)

            # Fold the head dimension into the batch dimension so one
            # ScaledDotProductAttention call handles all heads at once.
            def reshape1(x):
                s = tf.shape(x)  # [batch_size, len_q, n_head * d_k]
                x = tf.reshape(x, [s[0], s[1], n_head, s[2] // n_head])
                x = tf.transpose(x, [2, 0, 1, 3])
                x = tf.reshape(x, [-1, s[1], s[2] // n_head])  # [n_head * batch_size, len_q, d_k]
                return x

            qs = Lambda(reshape1)(qs)
            ks = Lambda(reshape1)(ks)
            vs = Lambda(reshape1)(vs)

            if mask is not None:
                mask = Lambda(lambda x: K.repeat_elements(x, n_head, 0))(mask)
            head, attn = self.attention(qs, ks, vs, mask=mask)

            # Unfold the heads and concatenate them along the feature axis.
            def reshape2(x):
                s = tf.shape(x)  # [n_head * batch_size, len_v, d_v]
                x = tf.reshape(x, [n_head, -1, s[1], s[2]])
                x = tf.transpose(x, [1, 2, 0, 3])
                x = tf.reshape(x, [-1, s[1], n_head * d_v])  # [batch_size, len_v, n_head * d_v]
                return x

            head = Lambda(reshape2)(head)
        elif self.mode == 1:
            heads = []
            attns = []
            for i in range(n_head):
                qs = self.qs_layers[i](q)
                ks = self.ks_layers[i](k)
                vs = self.vs_layers[i](v)
                head, attn = self.attention(qs, ks, vs, mask)
                heads.append(head)
                attns.append(attn)
            head = Concatenate()(heads) if n_head > 1 else heads[0]
            attn = Concatenate()(attns) if n_head > 1 else attns[0]

        outputs = self.w_o(head)
        outputs = Dropout(self.dropout)(outputs)
        return outputs, attn
```
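
And a minimal sketch of plugging MultiHeadAttention into a Keras functional model as self-attention. The shapes and the standalone Model wrapper are assumptions for illustration; in the repo the class is used inside the encoder/decoder layer blocks, and this sketch targets the same Keras 2.x / TF 1.x environment.

```python
# Hypothetical usage sketch, not part of transformer.py itself.
from keras.layers import Input
from keras.models import Model

d_model, n_head, seq_len = 256, 8, 50

# Already-embedded input sequence: [batch, seq_len, d_model].
src_seq = Input(shape=(seq_len, d_model))

mha = MultiHeadAttention(n_head=n_head, d_model=d_model, dropout=0.1, mode=0)
# Self-attention: q = k = v; no mask here for brevity.
out, attn = mha(src_seq, src_seq, src_seq, mask=None)

# `out` represents a [batch, seq_len, d_model] tensor, `attn` the attention
# weights with the head dimension folded into the batch dimension (mode 0).
model = Model(inputs=src_seq, outputs=out)
model.summary()
```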