NLP训练&落地的标准流程

EasyNLP深入实践 - 图1

代码示例

数据下载

  # Download the training and dev CSVs into the current directory, skipping
  # any file that is already present.
  base_url=https://atp-modelzoo-sh.oss-cn-shanghai.aliyuncs.com/release/tutorials/knowledge_nlu
  for csv_name in nlu_train.csv nlu_dev.csv; do
    if [ ! -f "./${csv_name}" ]; then
      # On failure, drop whatever was written so the -f check above does not
      # mistake a truncated download for a complete file on the next run.
      wget "${base_url}/${csv_name}" || {
        rm -f -- "./${csv_name}"
        echo "failed to download ${csv_name}" >&2
      }
    fi
  done

标准Finetune测试脚本

  # Standard fine-tuning baseline: --mode=train with --app_name=text_match,
  # initialised from bert-base-chinese, using --checkpoint_dir=./base_model.
  # --tables names nlu_train.csv and nlu_dev.csv, and --input_schema declares
  # three string columns (label, text1, text2).
  easynlp \
    --mode=train \
    --worker_count=1 \
    --worker_gpu=1 \
    --tables=nlu_train.csv,nlu_dev.csv \
    --input_schema=label:str:1,text1:str:1,text2:str:1 \
    --first_sequence=text1 \
    --second_sequence=text2 \
    --label_name=label \
    --label_enumerate_values=0,1 \
    --checkpoint_dir=./base_model \
    --learning_rate=3e-5 \
    --epoch_num=3 \
    --random_seed=42 \
    --save_checkpoint_steps=200 \
    --sequence_length=128 \
    --micro_batch_size=32 \
    --app_name=text_match \
    --user_defined_parameters="pretrain_model_name_or_path=bert-base-chinese"

知识增强预训练测试脚本

  # Knowledge-enhanced variant of the fine-tuning run: same flags as the
  # standard fine-tune command, except that the pretrained model is
  # alibaba-pai/pai-dkplm-medical-base-zh and --checkpoint_dir is ./dkplm_model.
  easynlp \
    --mode=train \
    --worker_count=1 \
    --worker_gpu=1 \
    --tables=nlu_train.csv,nlu_dev.csv \
    --input_schema=label:str:1,text1:str:1,text2:str:1 \
    --first_sequence=text1 \
    --second_sequence=text2 \
    --label_name=label \
    --label_enumerate_values=0,1 \
    --checkpoint_dir=./dkplm_model \
    --learning_rate=3e-5 \
    --epoch_num=3 \
    --random_seed=42 \
    --save_checkpoint_steps=200 \
    --sequence_length=128 \
    --micro_batch_size=32 \
    --app_name=text_match \
    --user_defined_parameters="pretrain_model_name_or_path=alibaba-pai/pai-dkplm-medical-base-zh"

知识蒸馏测试脚本

  # forward teacher logits
  # Runs --app_name=text_match in predict mode over nlu_train.csv with
  # --checkpoint_path=./dkplm_model (the path the knowledge-enhanced training
  # command passes as --checkpoint_dir) and writes the `logits` output schema
  # to logits_pred.csv.
  easynlp \
    --mode=predict \
    --worker_count=1 \
    --worker_gpu=1 \
    --tables=nlu_train.csv \
    --outputs=logits_pred.csv \
    --input_schema=label:str:1,text1:str:1,text2:str:1 \
    --output_schema=logits \
    --first_sequence=text1 \
    --second_sequence=text2 \
    --checkpoint_path=./dkplm_model \
    --micro_batch_size=32 \
    --sequence_length=128 \
    --app_name=text_match
  # augment the training data
  # Runs --app_name=data_augmentation in predict mode over nlu_train.csv and
  # writes the `augmented_data` output schema to nlu_aug.tsv.
  # --label_name=label must be a single word: with a space before '=', the
  # shell hands `--label_name` and `=label` to easynlp as two arguments.
  # NOTE(review): the indentation inside the --user_defined_parameters quotes
  # assumes easynlp splits that value on whitespace — confirm.
  easynlp \
    --app_name=data_augmentation \
    --worker_count=1 \
    --worker_gpu=1 \
    --mode=predict \
    --tables=nlu_train.csv \
    --input_schema=label:str:1,text1:str:1,text2:str:1 \
    --first_sequence=text1 \
    --second_sequence=text2 \
    --label_name=label \
    --outputs=nlu_aug.tsv \
    --output_schema=augmented_data \
    --checkpoint_dir=_ \
    --sequence_length=128 \
    --micro_batch_size=32 \
    --user_defined_parameters="
      pretrain_model_name_or_path=alibaba-pai/pai-dkplm-medical-base-zh
      type=mlm_da
      expansion_rate=10
      mask_proportion=0.1
      remove_blanks=True
    "
  # train student
  # NOTE(review): no student-training command follows this comment in this
  # section — TODO add it.

小样本学习测试脚本

详见:https://www.yuque.com/easyx/easynlp/ochmnf