Data Preparation
Build an mmap-format pretraining dataset using Megatron's data preprocessing script: https://github.com/NVIDIA/Megatron-LM/blob/main/tools/preprocess_data.py. Run the following command to produce the mmap dataset (an example of the expected input file format follows the command).
python preprocess_data.py \
  --input book_wiki_owtv2_small.json \
  --output-prefix gpt_small \
  --vocab gpt2-vocab.json \
  --dataset-impl mmap \
  --tokenizer-type GPT2BPETokenizer \
  --merge-file gpt2-merges.txt \
  --append-eod
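For reference, preprocess_data.py expects the --input file to be loose JSON with one document per line; by default it tokenizes the "text" field of each line. A tiny illustrative input (contents are placeholders) might look like:

{"text": "The first document. It may span several sentences."}
{"text": "The second document."}

The run emits an index/data file pair (typically gpt_small_text_document.bin and gpt_small_text_document.idx); their shared path prefix is what $DATA_PATH should point to in the launch command later.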
Write the User-Side Program
Import Dependencies
import torch
from easynlp.appzoo.api import get_application_model
from easynlp.utils.global_vars import parse_user_defined_parameters
from rapidformer import mpu, RapidformerEngine, get_args, PreTrainer, build_pretrain_huggingface_bert_datasets
Create the EasyNLP Pretraining Accelerator
class EasyNLPRoBertaPreTrainer(PreTrainer):
    def __init__(self, engine):
        super().__init__(engine=engine)

    def train_valid_test_datasets_provider(self, train_val_test_num_samples):
        # Build train/validation/test datasets from the mmap files
        # produced in the data-preparation step.
        args = get_args()
        train_ds, valid_ds, test_ds = build_pretrain_huggingface_bert_datasets(
            data_prefix=args.data_path,
            data_impl=args.data_impl,
            splits_string=args.split,
            train_valid_test_num_samples=train_val_test_num_samples,
            max_seq_length=args.seq_length,
            masked_lm_prob=args.mask_prob,
            short_seq_prob=args.short_seq_prob,
            seed=args.seed,
            skip_warmup=(not args.mmap_warmup),
            binary_head=True)
        return train_ds, valid_ds, test_ds

    def model_optimizer_lr_scheduler_provider(self):
        # Load the EasyNLP application model; the optimizer and
        # LR scheduler slots are left as None here.
        args = get_args()
        user_defined_parameters = parse_user_defined_parameters(args.user_defined_parameters)
        model = get_application_model(
            app_name=args.app_name,
            pretrained_model_name_or_path=args.pretrained_model_name_or_path,
            user_defined_parameters=user_defined_parameters)
        return model.backbone, None, None

    def run_forward_step(self, data_iterator, model):
        # Items and their type.
        keys = ['input_ids', 'attention_mask', 'token_type_ids',
                'labels', 'next_sentence_label']
        datatype = torch.int64

        # Broadcast the batch from the first tensor-parallel rank.
        if data_iterator is not None:
            data = next(data_iterator)
        else:
            data = None
        data_b = mpu.broadcast_data(keys, data, datatype)

        input_ids = data_b['input_ids'].long()
        attention_mask = data_b['attention_mask'].long()
        token_type_ids = data_b['token_type_ids'].long()
        labels = data_b['labels'].long()

        output_tensor = model(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids,
                              labels=labels)
        return output_tensor['loss']
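For intuition, run_forward_step consumes batches keyed exactly by the keys list above, with every field an int64 tensor. Below is a minimal sketch of that layout; the batch size, vocabulary size, and the -100 masked-LM ignore index are illustrative assumptions (commonly used with Hugging Face-style models), not values fixed by the trainer:

import torch

# Hypothetical batch, mirroring the keys that run_forward_step broadcasts.
batch_size, seq_len = 2, 512  # assumed shapes for illustration
sample_batch = {
    'input_ids': torch.randint(0, 30522, (batch_size, seq_len), dtype=torch.int64),
    'attention_mask': torch.ones(batch_size, seq_len, dtype=torch.int64),
    'token_type_ids': torch.zeros(batch_size, seq_len, dtype=torch.int64),
    # -100 is the conventional "ignore" label for unmasked positions.
    'labels': torch.full((batch_size, seq_len), -100, dtype=torch.int64),
    'next_sentence_label': torch.zeros(batch_size, dtype=torch.int64),
}
for key, tensor in sample_batch.items():
    print(key, tuple(tensor.shape), tensor.dtype)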
Define the Main Function
if __name__ == "__main__":
    engine = RapidformerEngine()
    trainer = EasyNLPRoBertaPreTrainer(engine=engine)
    trainer.train()
Launch Training
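The launch command below passes standard torch.distributed.launch options through $DISTRIBUTED_ARGS. A typical single-node, 8-GPU setting (the values here are placeholders; adjust them to your cluster) might be:

DISTRIBUTED_ARGS="--nproc_per_node 8 --nnodes 1 --node_rank 0 --master_addr localhost --master_port 6000"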
python -m torch.distributed.launch $DISTRIBUTED_ARGS pretrain_easynlp_bert.py \
  --app-name=language_modeling \
  --user-defined-parameters='pretrain_model_name_or_path=bert-base-uncased' \
  --task pretraining \
  --num-layers 12 \
  --hidden-size 768 \
  --num-attention-heads 12 \
  --micro-batch-size 16 \
  --global-batch-size 32 \
  --seq-length 512 \
  --tokenizer-type BertWordPieceLowerCase \
  --max-position-embeddings 512 \
  --train-iters 100 \
  --data-path $DATA_PATH \
  --vocab-file bert-en-uncased-vocab.txt \
  --split 980,20 \
  --distributed-backend nccl \
  --lr 1e-3 \
  --lr-decay-style linear \
  --min-lr 0.0 \
  --lr-decay-iters 2000 \
  --weight-decay 1e-2 \
  --clip-grad 1.0 \
  --lr-warmup-fraction .01 \
  --log-interval 1 \
  --mixed-precision \
  --onnx-runtime-training \
  --fsdp-memory-optimization \
  --data-impl mmap

The last four flags enable, respectively, mixed-precision training, computation-graph optimization via ONNX Runtime, GPU memory optimization via FSDP, and accelerated mmap data loading.
