Data Preparation

Build an mmap-format pretraining dataset. Refer to the data preprocessing script provided by Megatron: https://github.com/NVIDIA/Megatron-LM/blob/main/tools/preprocess_data.py and produce the mmap dataset with a command like the one below.

  python preprocess_data.py \
      --input book_wiki_owtv2_small.json \
      --output-prefix gpt_small \
      --vocab gpt2-vocab.json \
      --dataset-impl mmap \
      --tokenizer-type GPT2BPETokenizer \
      --merge-file gpt2-merges.txt \
      --append-eod
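
After the script finishes, the mmap dataset is stored as a pair of .bin/.idx files whose names are derived from --output-prefix and the JSON key being tokenized (text by default). A minimal sketch to confirm the expected files were produced; the exact file names below are an assumption based on Megatron's default naming convention:

  import os

  # Assumed output names: <output-prefix>_<json-key>_document.{bin,idx};
  # with --output-prefix gpt_small and the default key "text" this gives:
  prefix = "gpt_small_text_document"
  for ext in (".bin", ".idx"):
      path = prefix + ext
      size = os.path.getsize(path) if os.path.exists(path) else 0
      print(f"{path}: exists={os.path.exists(path)}, size={size} bytes")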

Writing the User Program

Import Dependencies

  import torch
  from easynlp.appzoo.api import get_application_model
  from easynlp.utils.global_vars import parse_user_defined_parameters
  from rapidformer import mpu, RapidformerEngine, get_args, PreTrainer, build_pretrain_huggingface_bert_datasets

Create the EasyNLP Pretraining Accelerator

  class EasyNLPRoBertaPreTrainer(PreTrainer):

      def __init__(self, engine):
          super().__init__(engine=engine)

      def train_valid_test_datasets_provider(self, train_val_test_num_samples):
          args = get_args()
          train_ds, valid_ds, test_ds = build_pretrain_huggingface_bert_datasets(
              data_prefix=args.data_path,
              data_impl=args.data_impl,
              splits_string=args.split,
              train_valid_test_num_samples=train_val_test_num_samples,
              max_seq_length=args.seq_length,
              masked_lm_prob=args.mask_prob,
              short_seq_prob=args.short_seq_prob,
              seed=args.seed,
              skip_warmup=(not args.mmap_warmup),
              binary_head=True)
          return train_ds, valid_ds, test_ds

      def model_optimizer_lr_scheduler_provider(self):
          args = get_args()
          user_defined_parameters = parse_user_defined_parameters(args.user_defined_parameters)
          model = get_application_model(app_name=args.app_name,
                                        pretrained_model_name_or_path=args.pretrained_model_name_or_path,
                                        user_defined_parameters=user_defined_parameters)
          return model.backbone, None, None

      def run_forward_step(self, data_iterator, model):
          # Items and their type.
          keys = ['input_ids', 'attention_mask', 'token_type_ids', 'labels', 'next_sentence_label']
          datatype = torch.int64
          # Broadcast data.
          if data_iterator is not None:
              data = next(data_iterator)
          else:
              data = None
          data_b = mpu.broadcast_data(keys, data, datatype)
          input_ids = data_b['input_ids'].long()
          attention_mask = data_b['attention_mask'].long()
          token_type_ids = data_b['token_type_ids'].long()
          labels = data_b['labels'].long()
          output_tensor = model(input_ids=input_ids, attention_mask=attention_mask,
                                token_type_ids=token_type_ids, labels=labels)
          return output_tensor['loss']
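
For reference, model_optimizer_lr_scheduler_provider simply builds the EasyNLP application model and hands its backbone to Rapidformer, returning None for the optimizer and learning-rate scheduler. Below is a standalone sketch of that call, using the app name and parameter string from the launch command further down; the bert-base-uncased checkpoint path is an assumption made for illustration only:

  from easynlp.appzoo.api import get_application_model
  from easynlp.utils.global_vars import parse_user_defined_parameters

  # Illustrative values; in the trainer they come from the command-line arguments.
  user_defined_parameters = parse_user_defined_parameters(
      'pretrain_model_name_or_path=bert-base-uncased')
  model = get_application_model(
      app_name='language_modeling',
      pretrained_model_name_or_path='bert-base-uncased',  # assumed checkpoint path
      user_defined_parameters=user_defined_parameters)

  # Rapidformer trains the underlying HuggingFace backbone directly.
  print(type(model.backbone))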

Define the Main Function

  1. if __name__ == "__main__":
  2. engine = RapidformerEngine()
  3. trainer = EasyNLPRoBertaPreTrainer(engine=engine)
  4. trainer.train()

Launch Training

  python -m torch.distributed.launch $DISTRIBUTED_ARGS pretrain_easynlp_bert.py \
      --app-name=language_modeling \
      --user-defined-parameters='pretrain_model_name_or_path=bert-base-uncased' \
      --task pretraining \
      --num-layers 12 \
      --hidden-size 768 \
      --num-attention-heads 12 \
      --micro-batch-size 16 \
      --global-batch-size 32 \
      --seq-length 512 \
      --tokenizer-type BertWordPieceLowerCase \
      --max-position-embeddings 512 \
      --train-iters 100 \
      --data-path $DATA_PATH \
      --vocab-file bert-en-uncased-vocab.txt \
      --split 980,20 \
      --distributed-backend nccl \
      --lr 1e-3 \
      --lr-decay-style linear \
      --min-lr 0.0 \
      --lr-decay-iters 2000 \
      --weight-decay 1e-2 \
      --clip-grad 1.0 \
      --lr-warmup-fraction .01 \
      --log-interval 1 \
      --mixed-precision \            # enable mixed precision
      --onnx-runtime-training \      # enable computation graph optimization
      --fsdp-memory-optimization \   # enable GPU memory optimization
      --data-impl mmap               # enable accelerated mmap data loading
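
The --micro-batch-size and --global-batch-size flags follow the usual Megatron-style relationship: the global batch is split across the data-parallel ranks, with any remainder covered by gradient accumulation. A small illustration of that arithmetic, assuming a purely data-parallel run on 2 GPUs (the GPU count is an assumption, not part of the command above):

  # Megatron-style batch-size bookkeeping (illustrative, not Rapidformer API).
  micro_batch_size = 16      # --micro-batch-size
  global_batch_size = 32     # --global-batch-size
  data_parallel_size = 2     # assumed: 2 GPUs, no tensor/pipeline parallelism
  grad_accum_steps = global_batch_size // (micro_batch_size * data_parallel_size)
  print(grad_accum_steps)    # -> 1 gradient-accumulation step per training iteration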