BERT
Reference code: bert/modeling.py.
import math
import tensorflow as tf

# Note: get_shape_list, reshape_to_matrix, create_initializer, and dropout
# are helper functions defined elsewhere in bert/modeling.py.


def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
  """Performs multi-headed attention from `from_tensor` to `to_tensor`.

  This is an implementation of multi-headed attention based on "Attention
  Is All You Need". If `from_tensor` and `to_tensor` are the same, then
  this is self-attention. Each timestep in `from_tensor` attends to the
  corresponding sequence in `to_tensor`, and returns a fixed-width vector.

  This function first projects `from_tensor` into a "query" tensor and
  `to_tensor` into "key" and "value" tensors. These are (effectively) a list
  of tensors of length `num_attention_heads`, where each tensor is of shape
  [batch_size, seq_length, size_per_head].

  Then, the query and key tensors are dot-producted and scaled. These are
  softmaxed to obtain attention probabilities. The value tensors are then
  interpolated by these probabilities, then concatenated back to a single
  tensor and returned.

  In practice, the multi-headed attention is done with transposes and
  reshapes rather than actual separate tensors.

  Args:
    from_tensor: float Tensor of shape [batch_size, from_seq_length,
      from_width].
    to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
    attention_mask: (optional) int32 Tensor of shape [batch_size,
      from_seq_length, to_seq_length]. The values should be 1 or 0. The
      attention scores will effectively be set to -infinity for any positions
      in the mask that are 0, and will be unchanged for positions that are 1.
    num_attention_heads: int. Number of attention heads.
    size_per_head: int. Size of each attention head.
    query_act: (optional) Activation function for the query transform.
    key_act: (optional) Activation function for the key transform.
    value_act: (optional) Activation function for the value transform.
    attention_probs_dropout_prob: (optional) float. Dropout probability of
      the attention probabilities.
    initializer_range: float. Range of the weight initializer.
    do_return_2d_tensor: bool. If True, the output will be of shape
      [batch_size * from_seq_length, num_attention_heads * size_per_head].
      If False, the output will be of shape [batch_size, from_seq_length,
      num_attention_heads * size_per_head].
    batch_size: (Optional) int. If the input is 2D, this might be the batch
      size of the 3D version of the `from_tensor` and `to_tensor`.
    from_seq_length: (Optional) If the input is 2D, this might be the seq
      length of the 3D version of the `from_tensor`.
    to_seq_length: (Optional) If the input is 2D, this might be the seq
      length of the 3D version of the `to_tensor`.

  Returns:
    float Tensor of shape [batch_size, from_seq_length,
      num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
      true, this will be of shape [batch_size * from_seq_length,
      num_attention_heads * size_per_head]).

  Raises:
    ValueError: Any of the arguments or tensor shapes are invalid.
  """

  def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                           seq_length, width):
    output_tensor = tf.reshape(
        input_tensor, [batch_size, seq_length, num_attention_heads, width])
    output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
    return output_tensor

  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

  if len(from_shape) != len(to_shape):
    raise ValueError(
        "The rank of `from_tensor` must match the rank of `to_tensor`.")

  if len(from_shape) == 3:
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]
    to_seq_length = to_shape[1]
  elif len(from_shape) == 2:
    if (batch_size is None or from_seq_length is None or
        to_seq_length is None):
      raise ValueError(
          "When passing in rank 2 tensors to attention_layer, the values "
          "for `batch_size`, `from_seq_length`, and `to_seq_length` "
          "must all be specified.")

  # Scalar dimensions referenced here:
  #   B = batch size (number of sequences)
  #   F = `from_tensor` sequence length
  #   T = `to_tensor` sequence length
  #   N = `num_attention_heads`
  #   H = `size_per_head`

  from_tensor_2d = reshape_to_matrix(from_tensor)  # [B*F, from_width]
  to_tensor_2d = reshape_to_matrix(to_tensor)      # [B*T, to_width]

  # `query_layer` = [B*F, N*H]
  query_layer = tf.layers.dense(
      from_tensor_2d,
      num_attention_heads * size_per_head,
      activation=query_act,
      name="query",
      kernel_initializer=create_initializer(initializer_range))

  # `key_layer` = [B*T, N*H]
  key_layer = tf.layers.dense(
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=key_act,
      name="key",
      kernel_initializer=create_initializer(initializer_range))

  # `value_layer` = [B*T, N*H]
  value_layer = tf.layers.dense(
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=value_act,
      name="value",
      kernel_initializer=create_initializer(initializer_range))

  # `query_layer` = [B, N, F, H]
  query_layer = transpose_for_scores(query_layer, batch_size,
                                     num_attention_heads, from_seq_length,
                                     size_per_head)

  # `key_layer` = [B, N, T, H]
  key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                   to_seq_length, size_per_head)

  # Take the dot product between "query" and "key" to get the raw
  # attention scores.
  # `attention_scores` = [B, N, F, T]
  attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
  attention_scores = tf.multiply(attention_scores,
                                 1.0 / math.sqrt(float(size_per_head)))

  if attention_mask is not None:
    # `attention_mask` = [B, 1, F, T]
    attention_mask = tf.expand_dims(attention_mask, axis=[1])

    # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
    # masked positions, this operation will create a tensor which is 0.0 for
    # positions we want to attend and -10000.0 for masked positions.
    adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

    # Since we are adding it to the raw scores before the softmax, this is
    # effectively the same as removing these entirely.
    attention_scores += adder

  # Normalize the attention scores to probabilities.
  # `attention_probs` = [B, N, F, T]
  attention_probs = tf.nn.softmax(attention_scores)

  # This is actually dropping out entire tokens to attend to, which might
  # seem a bit unusual, but is taken from the original Transformer paper.
  attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

  # `value_layer` = [B, T, N, H]
  value_layer = tf.reshape(
      value_layer,
      [batch_size, to_seq_length, num_attention_heads, size_per_head])

  # `value_layer` = [B, N, T, H]
  value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

  # `context_layer` = [B, N, F, H]
  context_layer = tf.matmul(attention_probs, value_layer)

  # `context_layer` = [B, F, N, H]
  context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

  if do_return_2d_tensor:
    # `context_layer` = [B*F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size * from_seq_length, num_attention_heads * size_per_head])
  else:
    # `context_layer` = [B, F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size, from_seq_length, num_attention_heads * size_per_head])

  return context_layer
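All of the projections, transposes, and reshapes above implement the standard scaled dot-product attention, softmax(QK^T / sqrt(H)) V, applied per head. As a sanity check, here is a minimal NumPy sketch of that core computation for a single head; it is independent of the BERT helpers (get_shape_list, reshape_to_matrix, create_initializer, dropout), and the function name and toy shapes are illustrative only, not part of modeling.py.

import numpy as np

def single_head_attention(q, k, v, mask=None):
    # Illustrative sketch, not BERT code.
    # q: [F, H], k: [T, H], v: [T, H]; mask: [F, T] with 1 = attend, 0 = masked.
    size_per_head = q.shape[-1]
    # Raw scores scaled by 1/sqrt(H), as in `attention_layer` above.
    scores = q @ k.T / np.sqrt(float(size_per_head))     # [F, T]
    if mask is not None:
        # Same trick as the BERT code: add a large negative value to
        # masked positions so that softmax pushes them toward 0.
        scores += (1.0 - mask) * -10000.0
    # Softmax over the "to" dimension (numerically stabilized).
    probs = np.exp(scores - scores.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)            # [F, T]
    # Interpolate the values by the attention probabilities.
    return probs @ v                                       # [F, H]

# Toy example: 2 query positions attending over 3 key/value positions.
q = np.random.randn(2, 4)
k = np.random.randn(3, 4)
v = np.random.randn(3, 4)
mask = np.array([[1.0, 1.0, 0.0], [1.0, 1.0, 1.0]])
print(single_head_attention(q, k, v, mask).shape)  # (2, 4)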
Attention Is All You Need
Reference code: attention-is-all-you-need-keras/transformer.py.
# Excerpt from transformer.py; the imports below cover what this excerpt uses.
import tensorflow as tf
import keras.backend as K
from keras.layers import (Activation, Add, Concatenate, Dense, Dropout,
                          Lambda, TimeDistributed)


# It's safe to use a 1-d mask for self-attention.
class ScaledDotProductAttention():
    def __init__(self, attn_dropout=0.1):
        self.dropout = Dropout(attn_dropout)

    def __call__(self, q, k, v, mask):  # mask_k or mask_qk
        # Scale factor sqrt(d_k), taken from the last dimension of the keys.
        temper = tf.sqrt(tf.cast(tf.shape(k)[-1], dtype='float32'))
        # Raw attention scores, shape=(batch, len_q, len_k).
        attn = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=[2, 2]) / temper)([q, k])
        if mask is not None:
            # Add a large negative value to masked positions before softmax.
            mmask = Lambda(lambda x: (-1e+9) * (1. - K.cast(x, 'float32')))(mask)
            attn = Add()([attn, mmask])
        attn = Activation('softmax')(attn)
        attn = self.dropout(attn)
        output = Lambda(lambda x: K.batch_dot(x[0], x[1]))([attn, v])
        return output, attn


class MultiHeadAttention():
    # mode 0 - big matrices, faster; mode 1 - clearer implementation
    def __init__(self, n_head, d_model, dropout, mode=0):
        self.mode = mode
        self.n_head = n_head
        self.d_k = self.d_v = d_k = d_v = d_model // n_head
        self.dropout = dropout
        if mode == 0:
            self.qs_layer = Dense(n_head * d_k, use_bias=False)
            self.ks_layer = Dense(n_head * d_k, use_bias=False)
            self.vs_layer = Dense(n_head * d_v, use_bias=False)
        elif mode == 1:
            self.qs_layers = []
            self.ks_layers = []
            self.vs_layers = []
            for _ in range(n_head):
                self.qs_layers.append(TimeDistributed(Dense(d_k, use_bias=False)))
                self.ks_layers.append(TimeDistributed(Dense(d_k, use_bias=False)))
                self.vs_layers.append(TimeDistributed(Dense(d_v, use_bias=False)))
        self.attention = ScaledDotProductAttention()
        self.w_o = TimeDistributed(Dense(d_model))

    def __call__(self, q, k, v, mask=None):
        d_k, d_v = self.d_k, self.d_v
        n_head = self.n_head

        if self.mode == 0:
            qs = self.qs_layer(q)  # [batch_size, len_q, n_head*d_k]
            ks = self.ks_layer(k)
            vs = self.vs_layer(v)

            def reshape1(x):
                # [batch_size, len_q, n_head*d_k] -> [n_head*batch_size, len_q, d_k]
                s = tf.shape(x)
                x = tf.reshape(x, [s[0], s[1], n_head, s[2] // n_head])
                x = tf.transpose(x, [2, 0, 1, 3])
                x = tf.reshape(x, [-1, s[1], s[2] // n_head])
                return x

            qs = Lambda(reshape1)(qs)
            ks = Lambda(reshape1)(ks)
            vs = Lambda(reshape1)(vs)

            if mask is not None:
                # Repeat the mask along the batch axis so each head sees its own copy.
                mask = Lambda(lambda x: K.repeat_elements(x, n_head, 0))(mask)
            head, attn = self.attention(qs, ks, vs, mask=mask)

            def reshape2(x):
                # [n_head*batch_size, len_v, d_v] -> [batch_size, len_v, n_head*d_v]
                s = tf.shape(x)
                x = tf.reshape(x, [n_head, -1, s[1], s[2]])
                x = tf.transpose(x, [1, 2, 0, 3])
                x = tf.reshape(x, [-1, s[1], n_head * d_v])
                return x

            head = Lambda(reshape2)(head)
        elif self.mode == 1:
            heads = []
            attns = []
            for i in range(n_head):
                qs = self.qs_layers[i](q)
                ks = self.ks_layers[i](k)
                vs = self.vs_layers[i](v)
                head, attn = self.attention(qs, ks, vs, mask)
                heads.append(head)
                attns.append(attn)
            head = Concatenate()(heads) if n_head > 1 else heads[0]
            attn = Concatenate()(attns) if n_head > 1 else attns[0]

        outputs = self.w_o(head)
        outputs = Dropout(self.dropout)(outputs)
        return outputs, attn
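For completeness, a hypothetical usage sketch that wires the MultiHeadAttention class above into a small Keras model. It assumes the class and imports above are in scope; the layer sizes and input shapes are illustrative assumptions (d_model must be divisible by n_head), not values taken from the original repository.

from keras.layers import Input
from keras.models import Model

d_model, n_head = 256, 8  # illustrative sizes; d_k = d_v = d_model // n_head = 32

q = Input(shape=(None, d_model))  # queries: [batch, len_q, d_model]
k = Input(shape=(None, d_model))  # keys:    [batch, len_k, d_model]
v = Input(shape=(None, d_model))  # values:  [batch, len_k, d_model]

mha = MultiHeadAttention(n_head=n_head, d_model=d_model, dropout=0.1, mode=0)
out, attn = mha(q, k, v)          # out: [batch, len_q, d_model]

model = Model(inputs=[q, k, v], outputs=out)
model.summary()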
