1. List of Pretrained Models

The complete list of pretrained models currently supported by EasyTexMiner is shown below, where L is the number of layers, H the hidden size, and A the number of attention heads.

Model                            Parameters
RoBERTa
  hit-roberta-base-zh            L=12, H=768,  A=12
  hit-roberta-large-zh           L=24, H=1024, A=16
  cro-roberta-tiny-zh            L=4,  H=312,  A=12
  brightmart-roberta-small-zh    L=6,  H=768,  A=12
  brightmart-roberta-large-zh    L=24, H=1024, A=16
ALBERT
  google-albert-base-zh/en       L=12, H=768,  A=12
  google-albert-large-zh/en      L=12, H=1024, A=16
  google-albert-xlarge-zh/en     L=24, H=2048, A=32
  google-albert-xxlarge-zh/en    L=12, H=4096, A=64
  pai-albert-base-zh/en          L=12, H=768,  A=12
  pai-albert-large-zh/en         L=12, H=1024, A=16
  pai-albert-xlarge-zh/en        L=24, H=2048, A=32
  pai-albert-xxlarge-zh/en       L=12, H=4096, A=64
BERT
  google-bert-base-zh            L=12, H=768,  A=12
  google-bert-base-en            L=12, H=768,  A=12
  google-bert-large-en           L=24, H=1024, A=16
  google-bert-small-en           L=6,  H=768,  A=12
  google-bert-tiny-en            L=2,  H=128,  A=2
  pai-bert-large-zh              L=24, H=1024, A=16
  pai-bert-base-zh               L=12, H=768,  A=12
  pai-bert-small-zh              L=6,  H=768,  A=12
  pai-bert-tiny-zh               L=2,  H=128,  A=2
  pai-bert-tiny-zh-L2-H768-A12   L=2,  H=768,  A=12
Cross-modal
  icbu-imagebert-small-en        L=6,  H=128,  A=2
  pai-imagebert-base-en          L=12, H=768,  A=12
  pai-imagebert-base-zh          L=12, H=768,  A=12
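
For quick reference, the sketch below loads one of these backbones through the model_zoo API used by the finetuning example in the next section. Passing the zoo identifier string directly to get_pretrained_model, and the num_labels value, are assumptions for illustration rather than documented usage.

from easytexminer import model_zoo

# Minimal sketch: load a Chinese BERT-base backbone by its zoo identifier.
# Assumption: get_pretrained_model accepts the identifier string directly,
# just as it accepts cfg.pretrain_model_name_or_path in the example below.
bert = model_zoo.get_pretrained_model("pai-bert-base-zh",
                                      model_name="text_classify_bert",
                                      num_labels=2)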

2. Finetuning Pretrained Models on Downstream Tasks

Based on the pretrained models above, you can finetune them on downstream tasks. The following uses BERT text classification as an example:

from easytexminer import modules, losses
from easytexminer.applications import get_application_predictor
from easytexminer.core import Evaluator, PredictorManager
from easytexminer.core.trainer import Trainer
from easytexminer.data import BertClassificationDataset
from easytexminer.utils import config, init_running_envs, get_dir_name, get_pretrain_model_path, distributed_call_main
from easytexminer import model_zoo

class BertTextClassify(modules.nn.BaseModel):
    """ BERT Classification/Regression Teacher """
    base_model_prefix = "bert"

    def __init__(self, config, **kwargs):
        super(BertTextClassify, self).__init__(config)
        # `cfg` is the command-line config parsed globally in __main__ below
        self.model_name = cfg.model_name
        self.bert = model_zoo.get_pretrained_model(cfg.pretrain_model_name_or_path,
                                                   model_name=cfg.model_name,
                                                   num_labels=kwargs.pop("num_labels", None))

    def forward(self, inputs):
        logits, _, _ = self.bert(inputs["input_ids"],
                                 inputs["segment_ids"],
                                 inputs["input_mask"])
        return {
            "logits": logits
        }

    def compute_loss(self, model_outputs, inputs):
        logits = model_outputs["logits"]
        label_ids = inputs["label_ids"]
        return {
            "loss": losses.cross_entropy(logits, label_ids)
        }

def main_fn(gpu, cfg, *args, **kwargs):
    # Prepare seed / logging / gpu environment
    init_running_envs(gpu, cfg)
    print("User Defined Example.")

    if cfg.mode == "predict":
        predictor = get_application_predictor(
            model_type=cfg.model_name, model_dir=cfg.checkpoint_dir,
            first_sequence=cfg.first_sequence,
            second_sequence=cfg.second_sequence,
            sequence_length=cfg.sequence_length)
        predictor_manager = PredictorManager(
            predictor=predictor,
            input_file=cfg.tables.split(",")[-1],
            input_schema=cfg.input_schema,
            output_file=cfg.outputs,
            output_schema=cfg.output_schema,
            append_cols=cfg.append_cols,
            batch_size=cfg.batch_size
        )
        predictor_manager.run()
        exit()

    import os
    vocab_file = os.path.join(get_dir_name(get_pretrain_model_path(cfg.pretrain_model_name_or_path)), "vocab.txt")
    pretrain_model_name_or_path = cfg.pretrain_model_name_or_path if cfg.pretrain_model_name_or_path \
        else cfg.checkpoint_dir

    # The last table in cfg.tables is used as the validation split
    valid_dataset = BertClassificationDataset(
        model_type="text_classify_bert",
        data_file=cfg.tables.split(",")[-1],
        vocab_file=vocab_file,
        max_seq_length=cfg.sequence_length,
        input_schema=cfg.input_schema,
        first_sequence=cfg.first_sequence,
        second_sequence=cfg.second_sequence,
        label_name=cfg.label_name,
        label_enumerate_values=cfg.label_enumerate_values,
        is_training=False)

    model = BertTextClassify.from_pretrained(
        pretrained_model_name_or_path=pretrain_model_name_or_path,
        num_labels=len(valid_dataset.label_enumerate_values))

    if cfg.mode == "train":
        # Build Data Loader
        train_dataset = BertClassificationDataset(model_type="text_classify_bert",
                                                  data_file=cfg.tables.split(",")[0],
                                                  vocab_file=vocab_file,
                                                  max_seq_length=cfg.sequence_length,
                                                  input_schema=cfg.input_schema,
                                                  first_sequence=cfg.first_sequence,
                                                  second_sequence=cfg.second_sequence,
                                                  label_name=cfg.label_name,
                                                  label_enumerate_values=cfg.label_enumerate_values,
                                                  is_training=True)
        # Training
        trainer = Trainer(model=model, train_dataset=train_dataset, valid_dataset=valid_dataset, cfg=cfg)
        trainer.train()
    elif cfg.mode == "evaluate":
        evaluator = Evaluator(metrics=valid_dataset.eval_metrics)
        evaluator.evaluate(model=model, valid_dataset=valid_dataset, eval_batch_size=cfg.eval_batch_size)

if __name__ == "__main__":
    parser = config.add_basic_argument()
    cfg = parser.parse_args()
    distributed_call_main(main_fn=main_fn, cfg=cfg)

Full Command

export CUDA_VISIBLE_DEVICES=$1

if [ ! -f ./train.tsv ]; then
  wget http://atp-modelzoo-sh.oss-cn-shanghai.aliyuncs.com/easytexminer/tutorials/classification/train.tsv
fi

if [ ! -f ./dev.tsv ]; then
  wget http://atp-modelzoo-sh.oss-cn-shanghai.aliyuncs.com/easytexminer/tutorials/classification/dev.tsv
fi

python main.py \
  --mode train \
  --tables=train.tsv,dev.tsv \
  --input_schema=label:str:1,sid1:str:1,sid2:str:1,sent1:str:1,sent2:str:1 \
  --first_sequence=sent1 \
  --second_sequence=sent2 \
  --label_name=label \
  --label_enumerate_values=0,1 \
  --pretrain_model_name_or_path=/home/jerry.lp/easy_nlp/EasyTexMiner/scripts/classification/bert-base-uncased \
  --checkpoint_dir=./classification_model/ \
  --learning_rate=3e-5  \
  --epoch_num=3  \
  --logging_steps=100 \
  --save_checkpoint_steps=50 \
  --sequence_length=128 \
  --train_batch_size=32 \
  --model_name=text_classify_bert

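The --input_schema flag above declares five tab-separated columns per row of the TSV files. As a sanity check, a row can be parsed with plain Python; reading each column_name:type:length triple as one tab-separated field is an assumption based on the flag's format, not a documented EasyTexMiner API.

import csv

# Minimal sketch: recover the column names from the schema string and read
# the first row of train.tsv as a dict (one tab-separated field per
# column_name:type:length triple -- an assumption).
schema = "label:str:1,sid1:str:1,sid2:str:1,sent1:str:1,sent2:str:1"
columns = [field.split(":")[0] for field in schema.split(",")]
with open("train.tsv") as f:
    first_row = dict(zip(columns, next(csv.reader(f, delimiter="\t"))))
print(first_row["label"], first_row["sent1"])
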
Results

3. Continual Pretraining on Business Data

In many business scenarios, especially those with large amounts of unlabeled data, continual pretraining can substantially improve downstream performance. The following uses RoBERTa masked language modeling as an example.

Data Preparation

Prepare the business dataset in the JSON format below. The value of the text field is an array whose elements are the individual sentences; the sentences may be either pre-tokenized or raw.

{"text": ["韦尔斯 ( ) , 英国 国会 一 郡 选区 , 位于 英格兰 西南部 萨默塞 特郡 东北部 , 包括 了 门迪普 西部 和 塞奇穆尔 北部 。", "本区 始设 于 1295年 , 1868年 被 >撤 前 为 应 选 两 席 的 自治市 选区 。", "1885年 恢复 以来 , 多数 支持 保守党 , 自由党 — 自民党 历史 只有 三 次 当选 纪录 。", "2010年 大选 中 自民党 的 泰莎·孟特 以 44
4.0% 选票 、 800 之 差 胜 出 该区 首 次 当选 。"]}
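
Raw business text can be converted into this format with a few lines of plain Python. The snippet below is a minimal sketch, not an EasyTexMiner API, and assumes one JSON object per line (the sample above shows a single document).

import json

# Minimal sketch: wrap each document's sentences (tokenized or raw) into the
# {"text": [...]} format shown above, writing one JSON object per line.
documents = [
    ["第一 个 句子 。", "第二 个 句子 。"],
]
with open("train.json", "w", encoding="utf-8") as f:
    for sentences in documents:
        f.write(json.dumps({"text": sentences}, ensure_ascii=False) + "\n")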

Code Example

from easytexminer import modules
from easytexminer.applications import get_application_predictor
from easytexminer.core import Evaluator, PredictorManager
from easytexminer.core.trainer import Trainer
from easytexminer.data import get_dataset
from easytexminer.utils import config, init_running_envs, get_dir_name, get_pretrain_model_path, distributed_call_main
from easytexminer import model_zoo
from easytexminer.losses import cross_entropy

class RoBertaTextPretrain(modules.nn.BaseModel):
    """ RoBERTa Pretraining """
    base_model_prefix = "bert"
    def __init__(self, config, **kwargs):
        super(RoBertaTextPretrain, self).__init__(config)
        # `cfg` is the command-line config parsed globally in __main__ below
        self.model_name = cfg.model_name
        self.bert = model_zoo.get_pretrained_model(cfg.pretrain_model_name_or_path,
                                                   model_name=cfg.model_name)

    def forward(self, inputs):
        prediction_scores = self.bert(input_ids=inputs["input_ids"],
                                      attention_mask=inputs["input_mask"])
        return {
            "logits": prediction_scores
        }

    def compute_loss(self, model_outputs, inputs):
        prediction_scores = model_outputs["logits"]
        masked_lm_labels = inputs["label_ids"]
        masked_lm_loss = cross_entropy(
            prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
        return {"loss": masked_lm_loss}

def main_fn(gpu, cfg, *args, **kwargs):
    # Prepare seed / logging / gpu environment
    init_running_envs(gpu, cfg)
    print("User Defined Example.")

    if cfg.mode == "predict":
        predictor = get_application_predictor(
            model_type=cfg.model_name, model_dir=cfg.checkpoint_dir,
            first_sequence=cfg.first_sequence,
            second_sequence=cfg.second_sequence,
            sequence_length=cfg.sequence_length)
        predictor_manager = PredictorManager(
            predictor=predictor,
            input_file=cfg.tables.split(",")[-1],
            input_schema=cfg.input_schema,
            output_file=cfg.outputs,
            output_schema=cfg.output_schema,
            append_cols=cfg.append_cols,
            batch_size=cfg.batch_size
        )
        predictor_manager.run()
        exit()

    import os
    vocab_file = os.path.join(get_dir_name(get_pretrain_model_path(cfg.pretrain_model_name_or_path)), "vocab.txt")
    pretrain_model_name_or_path = cfg.pretrain_model_name_or_path if cfg.pretrain_model_name_or_path \
        else cfg.checkpoint_dir

    valid_dataset = get_dataset(model_type=cfg.model_name,
                                data_file=cfg.tables.split(",")[-1],
                                vocab_file=vocab_file,
                                max_seq_length=cfg.sequence_length,
                                is_training=False)


    model = RoBertaTextPretrain.from_pretrained(
        pretrained_model_name_or_path=pretrain_model_name_or_path)

    if cfg.mode == "train":
        # Build Data Loader
        train_dataset = get_dataset(model_type=cfg.model_name,
                                    data_file=cfg.tables.split(",")[0],
                                    vocab_file=vocab_file,
                                    max_seq_length=cfg.sequence_length,
                                    is_training=True)
        # Training
        trainer = Trainer(model=model,
                          train_dataset=train_dataset,
                          valid_dataset=valid_dataset,
                          cfg=cfg)
        trainer.train()
    elif cfg.mode == "evaluate":
        evaluator = Evaluator(metrics=valid_dataset.eval_metrics)
        evaluator.evaluate(model=model, valid_dataset=valid_dataset, eval_batch_size=cfg.eval_batch_size)

if __name__ == "__main__":
    parser = config.add_basic_argument()
    cfg = parser.parse_args()

    distributed_call_main(main_fn=main_fn, cfg=cfg)

Full Command

#! /bin/bash
export CUDA_VISIBLE_DEVICES=$1

if [ ! -f ./train.json ]; then
  wget https://atp-modelzoo-sh.oss-cn-shanghai.aliyuncs.com/easytexminer/tutorials/language_modeling/train.json
fi

if [ ! -f ./dev.json ]; then
  wget https://atp-modelzoo-sh.oss-cn-shanghai.aliyuncs.com/easytexminer/tutorials/language_modeling/dev.json
fi

if [ ! -f ./hfl-chinese-roberta-wwm-ext.tgz ]; then
  wget https://atp-modelzoo-sh.oss-cn-shanghai.aliyuncs.com/easytexminer/tutorials/language_modeling/hfl-chinese-roberta-wwm-ext.tgz
  tar -zxf hfl-chinese-roberta-wwm-ext.tgz
fi

python main.py \
    --mode=train \
    --tables=train.json,dev.json \
    --learning_rate=5e-4  \
    --epoch_num=1  \
    --logging_steps=10 \
    --save_checkpoint_steps=20 \
    --sequence_length=128 \
    --train_batch_size=2 \
    --pretrain_model_name_or_path=$PWD/hfl-chinese-roberta-wwm-ext \
    --model_name=language_modeling_bert \
    --checkpoint_dir=./lm_models

Results

4. ModelZoo Prediction

For prediction with ModelZoo models, refer to the model prediction documentation: https://www.yuque.com/easytransfer/easytm/kfyezb