Sniper

Sniper

Model主要存放一些模型。比如Trm、Bert、T5等。

class Transformer()

class Transformer(object)

模型基类。所有Transformer based（Bert以及各种变种、T5等）的模型的基类。

def __init__(
    self,
    vocab_size,  # 词表大小
    hidden_size,  # 编码维度
    num_hidden_layers,  # Transformer总层数
    num_attention_heads,  # Attention的头数
    intermediate_size,  # FeedForward的隐层维度
    hidden_act,  # FeedForward隐层的激活函数
    dropout_rate=None,  # Dropout比例
    attention_dropout_rate=None, # Attention矩阵的Dropout比例(2021.09.13更新）
    embedding_size=None,  # 是否指定embedding_size
    attention_head_size=None,  # Attention中V的head_size
    attention_key_size=None,  # Attention中Q,K的head_size
    sequence_length=None,  # 是否固定序列长度
    keep_tokens=None,  # 要保留的词ID列表
    compound_tokens=None,  # 扩展Embedding
    residual_attention_scores=False,  # Attention矩阵加残差
    ignore_invalid_weights=False,  # 允许跳过不存在的权重
    layers=None,  # 外部传入的Keras层
    prefix=None,  # 层名前缀
    name=None,  # 模型名称
    **kwargs
)

大部分参数代码注释比较完善，需要格外说明的：

hierarchical默认为None，为True时使用超长编码（利用层次分解，将bert（Transformer）最长512的序列长度扩充为512*512，会损失一定精度，但是微调后可以用很小的代价恢复性能），详见苏神博客。

residual_attention_scores为是否使用残差Attention矩阵。残差Attention矩阵即给每个Attention矩阵加上上一层的Attention矩阵，来源于RealFormer论文。目前的实现可能还相对粗糙，欠缺通用性。

ignore_invalid_weights为是否允许跳过名字不匹配的权重。默认为False；为True时，遇到名字不匹配的层时，会输出一条报错信息，但是程序并不会终止，该层的权重会随机初始化。

2021.09.13更新：新增支持attention 的dropout。PS：由于更新，下面的源码行数（&SOURCE）可能有一定的偏移，但是由于更新较小，偏移不大，基本就在行数下面几行的位置，因此不做更改。github地址

def build(self):

def build(
        self,
        attention_caches=None,
        layer_norm_cond=None,
        layer_norm_cond_hidden_size=None,
        layer_norm_cond_hidden_act=None,
        additional_input_layers=None,
        **kwargs
    ):

attention_caches 为Attention的K,V的缓存序列字典，格式为{Attention层名: [K缓存, V缓存]}；

layer_norm_*系列参数：实现Conditional Layer Normalization 时使用，用来实现以“固定长度向量”为条件的条件Bert。该方法通过在LN层加入一个方向的扰动，从而可以在一个模型中完成多个类似的任务，比如在一个模型中生成积极的文本和消极的文本、在一个模型中进行短短文本匹配，短长文本匹配等。详见苏神博客

additional_input_layers为除Bert原生输入外其余的输入项。通过self.set_inputs()来添加到模型中。

def call(self):

def call(self, inputs):
    """Run the model's forward pipeline.

    The input is first turned into embeddings, then passed through each of
    the ``self.num_hidden_layers`` main layers in order, and finally through
    the final layers.

    Args:
        inputs: Whatever ``self.apply_embeddings`` accepts.

    Returns:
        The value returned by ``self.apply_final_layers``.
    """
    # Stage 1: embeddings.
    hidden = self.apply_embeddings(inputs)
    # Stage 2: main layers; each call receives the index of its layer.
    for layer_index in range(self.num_hidden_layers):
        hidden = self.apply_main_layers(hidden, layer_index)
    # Stage 3: final layers.
    return self.apply_final_layers(hidden)

从call方法可以看出来，模型整体依次是embedding、main layers（Transformer）、final layers（dense）。

def set_inputs(self):

def set_inputs(self, inputs, additional_input_layers=None):
    """Populate the ``inputs`` and ``input`` attributes.

    ``self.inputs`` is always a list: a copy of ``inputs`` (or ``inputs``
    wrapped in a list when it is not one, or an empty list when it is
    ``None``) followed by any additional input layers. ``self.input`` is
    that same list when it holds more than one element, otherwise its
    first element.

    Args:
        inputs: A single input, a list of inputs, or ``None``.
        additional_input_layers: Optional extra input or list of extra
            inputs appended after ``inputs``.

    Raises:
        IndexError: If the merged list ends up empty (``self.inputs`` has
            already been assigned at that point).
    """
    if inputs is None:
        merged = []
    elif isinstance(inputs, list):
        # Copy so the caller's list is never mutated by the extension below.
        merged = inputs[:]
    else:
        merged = [inputs]

    if additional_input_layers is not None:
        extras = additional_input_layers
        if not isinstance(extras, list):
            extras = [extras]
        merged += extras

    self.inputs = merged
    self.input = merged if len(merged) > 1 else merged[0]

从set_inputs方法可以看出additional_input_layers是如何添加的，同时也处理了input属性。（注意区分input和inputs，我研究半天这是干嘛的，后来发现不一样：inputs始终是列表；input在只有一个输入时是该输入本身，有多个输入时才是列表。如果你观察过bert4keras的模型，就会发现有input和inputs两个变量。）

def load_embeddings(self):

def load_embeddings(self, embeddings):
    """Post-process the Embedding layer weights.

    The matrix is cast to the backend float type, optionally reduced to the
    rows listed in ``self.keep_tokens``, and optionally extended with one
    extra row per entry of ``self.compound_tokens``.

    Each compound-token entry is either a list of row indices (plain mean of
    those rows) or a pair ``(indices, weights)`` (weighted mean). Indices
    refer to the matrix after the ``keep_tokens`` selection.

    Args:
        embeddings: Array-like with an ``astype`` method; rows are indexed
            by token id.

    Returns:
        The processed embedding matrix.
    """
    # Cast first; the original author notes np.average errors out otherwise.
    embeddings = embeddings.astype(K.floatx())
    if self.keep_tokens is not None:
        embeddings = embeddings[self.keep_tokens]
    # An empty compound_tokens means "nothing to append". Without this guard
    # np.concatenate would be given a 2-D matrix and an empty 1-D list and
    # raise a dimension-mismatch ValueError.
    if self.compound_tokens is not None and len(self.compound_tokens) > 0:
        ext_embeddings = []
        for item in self.compound_tokens:
            if isinstance(item, list):
                # Bare index list: give every index the same weight.
                item = (item, [1] * len(item))
            ext_embeddings.append(
                np.average(embeddings[item[0]], 0, item[1])
            )
        embeddings = np.concatenate([embeddings, ext_embeddings], 0)
    return embeddings

load_embeddings分别对应缩小embedding（keep_tokens）和扩大embedding（compound_tokens）两种情况。

前者用于不需要这么多token的场景（比如bert4keras默认的精简方式，详见参数simplified），只需要将embedding对应部分截取出来就行。后者对应需要更多token的场景，新token的向量由已有token的向量（加权）平均得到，再作为新的行（axis=0）拼接到embedding末尾就行了。

def compute_attention_bias(self)

def compute_attention_bias(self, inputs=None):
    """Return the attention bias used by each layer.

    This version ignores ``inputs`` and hands back ``self.attention_bias``
    unchanged.
    """
    bias = self.attention_bias
    return bias

这个方法主要是计算attention的mask（或者bias），比如LM_Mask以及UniLM_Mask中复写了compute_attention_bias，用于相关用途（在attention阶段添加mask[比如LM中的随机Mask]或bias[比如NEZHA在attention中添加相对位置编码]）。

Transformer

Sniper

class Transformer()

def build(self):

def call(self):

def set_inputs(self):

def load_embeddings(self):

def compute_attention_bias(self)

class LM_Mask()

class UniLM_Mask()

class BERT()

def apply_embeddings(self):

def apply_main_layers(self):

class ALBERT()

def apply_main_layers(self):

class ALBERT_Unshared()

class NEZHA()

def apply_embeddings(self):

def compute_position_bias(self):

def apply_main_layers(self):

class RoFormer()

class ELECTRA()

class GPT()

def apply_embedding()

class GPT2()

def get_inputs()

def apply_embeddings()

def apply_main_layers()

def apply_final_layers()

class GPT2_ML()

class T5_Base()

class T5_Encoder()

def apply_embeddings()

def apply_main_layers()

def compute_position_bias()

class T5_Decoder()

def apply_embeddings()

def apply_main_layers()

def compute_position_bias()

class T5()

def extend_with_language_model()

def extend_with_unified_language_model()

def build_transformer_model()