GPU Training

Method 1: find the three variables and call .cuda() on each

  1. the model
  2. the loss function
  3. the data: imgs, target = data


  import torch
  import torch.nn as nn

  mymodule = Mymodule()
  # ======================== cuda ======================================
  if torch.cuda.is_available():
      mymodule = mymodule.cuda()

  # 3. Loss function and optimizer
  loss_fn = nn.CrossEntropyLoss()
  # ======================== cuda ======================================
  if torch.cuda.is_available():
      loss_fn = loss_fn.cuda()

  with torch.no_grad():
      for data in test_dataloader:
          imgs, target = data
          # ======================== cuda ======================================
          if torch.cuda.is_available():
              imgs = imgs.cuda()
              target = target.cuda()
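A quick way to confirm the moves actually happened is to check the .device attribute of a parameter and of the input tensor (a small sanity-check sketch; mymodule and imgs are the objects from the block above):

  # Both should print cuda:0 when a GPU is available
  print(next(mymodule.parameters()).device)
  print(imgs.device)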

Method 2: to(device)

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
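The device string can also name a specific card, which is useful on multi-GPU machines (a short sketch just illustrating the torch.device strings):

  device = torch.device("cuda")    # the current default GPU
  device = torch.device("cuda:1")  # the second visible GPU
  device = torch.device("cpu")     # explicit CPU fallback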


  # Define the training device
  # device = torch.device("cpu")
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print("device:", device)

  mymodule = Mymodule()
  # ======================== cuda ======================================
  # if torch.cuda.is_available():
  #     mymodule = mymodule.cuda()
  mymodule = mymodule.to(device)  # assignment is optional: nn.Module.to() moves the module in place

  # 3. Loss function and optimizer
  loss_fn = nn.CrossEntropyLoss()
  # ======================== cuda ======================================
  # if torch.cuda.is_available():
  #     loss_fn = loss_fn.cuda()
  loss_fn = loss_fn.to(device)  # assignment is optional here as well

  # Training loop
  mymodule.train()  # not strictly required for this model
  for data in train_dataloader:
      imgs, target = data
      # ======================== cuda ======================================
      # if torch.cuda.is_available():
      #     imgs = imgs.cuda()
      #     target = target.cuda()
      imgs = imgs.to(device)      # assignment IS required: Tensor.to() returns a copy
      target = target.to(device)  # assignment IS required
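The reason tensors must be reassigned while modules need not be is that Tensor.to() returns a new tensor, whereas nn.Module.to() mutates the module itself. A minimal illustration (device as defined above):

  x = torch.ones(2, 3)
  y = x.to(device)     # x is untouched and stays on the CPU; y lives on `device`
  print(x.device, y.device)

  mymodule.to(device)  # no assignment needed: the module's parameters are moved in place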

Configuring multiple GPUs

  import os
  # Restrict which GPUs are visible to this process
  # os.environ["CUDA_VISIBLE_DEVICES"] = "5,6,7"
  # os.environ["CUDA_VISIBLE_DEVICES"] = "0"
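Note that CUDA_VISIBLE_DEVICES only takes effect if it is set before CUDA is first initialized. Once several GPUs are visible, the simplest single-process option is nn.DataParallel, which splits each batch across the visible cards (a minimal sketch, not the only way to use multiple GPUs; Mymodule is the model from above):

  import os
  os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # set before any CUDA call

  import torch
  import torch.nn as nn

  model = Mymodule()
  if torch.cuda.device_count() > 1:
      model = nn.DataParallel(model)  # replicates the model and scatters each batch
  model = model.cuda()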

torch.distributed.launch (run from a shell script)

  #!/usr/bin/env bash
  CONFIG=$1
  GPUS=$2
  PORT=${PORT:-29500}

  PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
  python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
      $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3}
  # General form: python -m torch.distributed.launch main.py

https://zhuanlan.zhihu.com/p/86441879

python -m torch.distributed.launch --nproc_per_node=4 train.py --a b --c d
  • python -m torch.distributed.launch invokes torch/distributed/launch.py to start distributed training.
  • -m tells Python to load torch.distributed.launch as a module.
  • --nproc_per_node should normally match the number of GPUs on the node.
  • train.py is the actual training script; everything after it is passed through as its arguments.
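On the receiving side, a script started this way (in the PyTorch versions where torch.distributed.launch was current; it was later superseded by torchrun) typically parses the --local_rank argument the launcher injects, initializes the process group, and wraps the model in DistributedDataParallel. A minimal sketch, not the mmdetection-style train.py from the script above; Mymodule again stands in for your model:

  import argparse
  import torch
  import torch.distributed as dist
  from torch.nn.parallel import DistributedDataParallel as DDP

  parser = argparse.ArgumentParser()
  parser.add_argument("--local_rank", type=int, default=0)  # injected by torch.distributed.launch
  args = parser.parse_args()

  # MASTER_ADDR / MASTER_PORT / RANK / WORLD_SIZE are set by the launcher
  dist.init_process_group(backend="nccl")
  torch.cuda.set_device(args.local_rank)

  model = Mymodule().cuda(args.local_rank)
  model = DDP(model, device_ids=[args.local_rank])  # one process per GPU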