CPU与GPU
数据迁移至GPU
多GPU并行运算

GPU 与CPU

CPU（Central Processing Unit, 中央处理器）：主要包括控制器和运算器
GPU(Graphics Processing Unit, 图形处理器)：处理统一的，无依赖的大规模数据运算

数据迁移

数据必须在同一处理器，才能被处理

to函数

转换数据类型/设备

tensor.to(*args, **kwargs)
module.to(*args, **kwargs)

区别：张量不执行inplace，模型执行inplace

# Tensor case: the result of .to() is re-assigned to x each time.
x = torch.ones((3, 3))
x = x.to(torch.float64)  # dtype conversion: float32 -> float64
x = torch.ones((3, 3))
x = x.to("cuda")  # device migration: cpu -> cuda
# Module case: .to() is called without re-assigning the module.
linear = nn.Linear(2, 2)
linear.to(torch.double)  # convert the module to double
gpu1 = torch.device("cuda")
linear.to(gpu1)  # module: cpu -> cuda


import torch
import torch.nn as nn
# Use "cuda:0" when CUDA is available, otherwise fall back to "cpu".
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# ========================== 02_tensor to cuda ========================== #
# flag = 0
flag = 1
if flag:
    # Build a tensor and print its device, CUDA status and Python object id.
    x_cpu = torch.ones((3, 3))
    print("x_cpu:\ndevice: {} is_cuda: {} id: {}".format(x_cpu.device, x_cpu.is_cuda, id(x_cpu)))
    # The result of .to(device) is bound to a separate name so both can be compared.
    x_gpu = x_cpu.to(device)
    print("x_gpu:\ndevice: {} is_cuda: {} id: {}".format(x_gpu.device, x_gpu.is_cuda, id(x_gpu)))
# Deprecated
# x_gpu = x_cpu.cuda()  # the approach used before version 0.4.0
# ========================== module to cuda ========================== #
# flag = 0
flag = 1
if flag:
    # Print the module's id and whether its first parameter lives on CUDA,
    # before and after moving the module to `device`.
    net = nn.Sequential(nn.Linear(3, 3))
    print("\nid:{} is_cuda: {}".format(id(net), next(net.parameters()).is_cuda))
    # The module is moved without re-binding `net`.
    net.to(device)
    # The memory address (id) stays the same
    print("\nid:{} is_cuda: {}".format(id(net), next(net.parameters()).is_cuda))
# ========================== forward in cuda ========================== #
# flag = 0
flag = 1
if flag:
    # Uses `net` and `x_gpu` created by the two sections above, so those
    # sections must have run with flag = 1.
    output = net(x_gpu)
    print("output is_cuda: {}".format(output.is_cuda))
    # output = net(x_cpu)  # raises an error: data is on the CPU while the model is on the GPU
# ========================== query the current GPU index, try changing the visible GPUs and the main GPU ========================== #
flag = 0
# flag = 1
if flag:
    # Print the current device index, select device 0, then print it again.
    current_device = torch.cuda.current_device()
    print("current_device: ", current_device)
    torch.cuda.set_device(0)
    current_device = torch.cuda.current_device()
    print("current_device: ", current_device)
    # Device capability; device=None is passed explicitly
    # (presumably selects the current device - confirm against the torch.cuda docs).
    cap = torch.cuda.get_device_capability(device=None)
    print(cap)
    # Device name and CUDA availability
    name = torch.cuda.get_device_name()
    print(name)
    is_available = torch.cuda.is_available()
    print(is_available)
    # ===================== seed ========================== #
    # Seed the current GPU, then all GPUs, with the same value and print
    # the seed reported by initial_seed().
    seed = 2
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    current_seed = torch.cuda.initial_seed()
    print(current_seed)
    # NOTE(review): `s` and `s_all` are never read after assignment in this script;
    # confirm what seed()/seed_all() return before relying on these values.
    s = torch.cuda.seed()
    s_all = torch.cuda.seed_all()

torch.cuda常用方法

torch.cuda.device_count()：计算当前可见可用gpu数
torch.cuda.get_device_name()：获取gpu名称
torch.cuda.manual_seed()：为当前gpu设置随机种子
torch.cuda.manual_seed_all()：为所有可见可用gpu设置随机种子
torch.cuda.set_device()：设置主gpu为哪一个物理gpu（不推荐）
推荐： os.environ.setdefault("CUDA_VISIBLE_DEVICES", "2, 3")

**
我们首先要理解什么叫物理GPU，什么叫逻辑GPU。

物理GPU：实实在在插在我们的主机上的显卡
逻辑GPU：是python脚本当中可见的那些gpu。逻辑GPU的数量一定是小于等于物理GPU的数量。

那么应该如何理解上面代码所设置的2号、3号GPU呢？物理GPU是永远不会变的，逻辑GPU会变。这里我们设置物理GPU 2和3对脚本可见，因此逻辑GPU只有两个：逻辑GPU0和逻辑GPU1，原本的逻辑GPU2、GPU3就不存在了。其中逻辑GPU0对应物理GPU2，逻辑GPU1对应物理GPU3。
新建坚果云绘图.png

假如现在我们设置如下代码，我们来思考逻辑GPU又是怎么对应的。

os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0, 3, 2")

新建坚果云绘图.png
分配过程如图所示。我们为什么要这样煞费苦心的去设置这一个环境变量呢？可以灵活的分配GPU资源。在逻辑GPU当中我们通常有一个主gpu的概念，通常呢我们默认为第0个gpu他就是主gpu。

为什么要提主GPU这个概念呢？这与多GPU运算的分发并行机制有关。下面我们就来学习PyTorch当中的多GPU并行运算。

多GPU并行运算

以小明做作业为例：

假设一个小朋友做一份作业需要60min，一共4份作业，总共需要240min才能完成。

如果他找其他三个小伙伴帮忙做作业，找到这些小伙伴需要3min，再把作业平均分发给小伙伴。当所有作业完成之后，小明需要对所有作业进行回收审核，回收审核需要花费3min时间。所以总共花费3 + 60 + 3 = 66min就可以完成全部作业。

`torch.nn.DataParallel()`

torch.nn.DataParallel(
    module,
    device_ids=None,
    output_device=None,
    dim=0)

功能：包装模型，实现分发并行机制
主要参数：
- module : 需要包装分发的模型
- device_ids : 可分发的gpu，默认分发到所有可见可用gpu
- output_device : 结果输出设备

查询当前gpu内存剩余

def get_gpu_memory():
    """Return the free-memory figure reported by ``nvidia-smi`` for each GPU.

    Runs ``nvidia-smi -q -d Memory | grep -A4 GPU | grep Free`` in a shell and
    parses the third whitespace-separated field of every output line as an int.

    Returns:
        list[int]: one value per matching line, in the order printed. The
        list is empty when the pipeline prints nothing.
    """
    import subprocess

    # Capture the pipeline's stdout directly instead of redirecting it into a
    # tmp.txt in the working directory: no file handle is left open, nothing
    # is left behind if parsing fails, and no external `rm` is required.
    result = subprocess.run(
        'nvidia-smi -q -d Memory | grep -A4 GPU | grep Free',
        shell=True,
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    return [int(line.split()[2]) for line in result.stdout.splitlines() if line.strip()]

# example:
gpu_memory = get_gpu_memory()
# GPU indices ordered from the largest to the smallest reported free memory.
gpu_list = np.argsort(gpu_memory)[::-1]
gpu_list_str = ','.join(str(gpu_index) for gpu_index in gpu_list)
# setdefault only writes the variable when it is not already present.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", gpu_list_str)
print("\ngpu free memory: {}".format(gpu_memory))
print("CUDA_VISIBLE_DEVICES :{}".format(os.environ["CUDA_VISIBLE_DEVICES"]))
# Sample output:
# gpu free memory: [10362, 10058, 9990, 9990]
# CUDA_VISIBLE_DEVICES: 0,1,3,2

# -*- coding: utf-8 -*-
import os
import numpy as np
import torch
import torch.nn as nn
# ============================ manually choose the GPUs
# flag = 0
flag = 1
if flag:
    # Single-GPU alternative: gpu_list = [0]
    gpu_list = [0, 1, 2, 3]
    gpu_list_str = ','.join(str(gpu_id) for gpu_id in gpu_list)
    # setdefault leaves an already-set CUDA_VISIBLE_DEVICES untouched.
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", gpu_list_str)
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
# ============================ automatically choose the main GPU by free memory
# flag = 0
flag = 1
if flag:
    def get_gpu_memory():
        """Return per-GPU free memory parsed from ``nvidia-smi``.

        Returns:
            list[int] on non-Windows systems (empty when the command prints
            nothing); ``False`` on Windows, where the query is not supported.
        """
        import platform
        if 'Windows' == platform.system():
            print("显存计算功能暂不支持windows操作系统")
            return False

        import subprocess
        # Read the pipeline's stdout directly rather than going through a
        # tmp.txt in the working directory: no unclosed file handle, no
        # leftover file if parsing raises, and no external `rm`.
        result = subprocess.run(
            'nvidia-smi -q -d Memory | grep -A4 GPU | grep Free',
            shell=True,
            stdout=subprocess.PIPE,
            universal_newlines=True,
        )
        return [int(line.split()[2]) for line in result.stdout.splitlines() if line.strip()]

    gpu_memory = get_gpu_memory()
    # Fixed inverted check: the ordering below only makes sense when memory
    # figures were actually obtained. The original `if not gpu_memory` ran it
    # for False / [] and skipped it whenever real data was available.
    if gpu_memory:
        print("\n14_gpu_use free memory: {}".format(gpu_memory))
        # GPU indices ordered from most to least free memory, so the first
        # visible (main) GPU is the one with the most free memory.
        gpu_list = np.argsort(gpu_memory)[::-1]
        gpu_list_str = ','.join(map(str, gpu_list))
        # setdefault keeps an existing CUDA_VISIBLE_DEVICES value (set by the
        # shell or earlier in the script); this ordering applies only when the
        # variable is still unset.
        os.environ.setdefault("CUDA_VISIBLE_DEVICES", gpu_list_str)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class FooNet(nn.Module):
    """Stack of bias-free square linear layers, each followed by a ReLU.

    Args:
        neural_num: input and output width of every linear layer.
        layers: number of linear layers in the stack.
    """

    def __init__(self, neural_num, layers=3):
        super().__init__()
        stack = []
        for _ in range(layers):
            stack.append(nn.Linear(neural_num, neural_num, bias=False))
        # The attribute name becomes part of the state_dict keys
        # ("linears.N.weight"), so it must stay `linears`.
        self.linears = nn.ModuleList(stack)

    def forward(self, x):
        """Print the incoming batch size, then apply each layer and ReLU in order."""
        print("\nbatch size in forward: {}".format(x.size()[0]))
        out = x
        for layer in self.linears:
            out = torch.relu(layer(out))
        return out
if __name__ == "__main__":
    # Demo driver: push one random batch through FooNet wrapped in
    # DataParallel and print the output size and the GPU environment.
    batch_size = 16

    # data: random inputs/labels, each moved to the selected device
    inputs = torch.randn(batch_size, 3).to(device)
    labels = torch.randn(batch_size, 3).to(device)

    # model: wrap for multi-GPU dispatch, then move to the device
    net = nn.DataParallel(FooNet(neural_num=3, layers=3))
    net.to(device)

    # training (a single forward pass)
    for epoch in range(1):
        outputs = net(inputs)
        print("model outputs.size: {}".format(outputs.size()))

    visible_devices = os.environ["CUDA_VISIBLE_DEVICES"]
    print("CUDA_VISIBLE_DEVICES :{}".format(visible_devices))
    print("device_count :{}".format(torch.cuda.device_count()))

gpu模型加载报错

报错一

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

在GPU上训练并保存模型之后，然后将该模型放到CPU的机器上加载，会报错
解决：

torch.load(path_state_dict, map_location="cpu")  # 将map_location 设置为CPU

报错二

RuntimeError: Error(s) in loading state_dict for FooNet: Missing key(s) in state_dict: "linears.0.weight", "linears.1.weight", "linears.2.weight". Unexpected key(s) in state_dict: "module.linears.0.weight", "module.linears.1.weight", "module.linears.2.weight".

多GPU并行运算训练之后，模型被 DataParallel 包装，网络层的命名会多一个 module. 前缀。因此加载 state_dict 时，字典的 keys 不匹配，报错 Missing key(s)。

解决：

from collections import OrderedDict
# Build a new OrderedDict whose keys have the leading "module." removed;
# keys without that prefix are copied unchanged.
prefix = 'module.'
new_state_dict = OrderedDict(
    (k[len(prefix):] if k.startswith(prefix) else k, v)
    for k, v in state_dict_load.items()
)

import os
import numpy as np
import torch
import torch.nn as nn


class FooNet(nn.Module):
    """Stack of bias-free square linear layers, each followed by a ReLU.

    Args:
        neural_num: input and output width of every linear layer.
        layers: number of linear layers in the stack.
    """

    def __init__(self, neural_num, layers=3):
        super().__init__()
        stack = []
        for _ in range(layers):
            stack.append(nn.Linear(neural_num, neural_num, bias=False))
        # The attribute name becomes part of the state_dict keys
        # ("linears.N.weight"), so it must stay `linears`.
        self.linears = nn.ModuleList(stack)

    def forward(self, x):
        """Print the incoming batch size, then apply each layer and ReLU in order."""
        print("\nbatch size in forward: {}".format(x.size()[0]))

        out = x
        for layer in self.linears:
            out = torch.relu(layer(out))
        return out


# =================================== load onto the cpu
# Disabled by default (flag = 0). When enabled: builds a FooNet on `device`,
# saves its state_dict, then loads it back with map_location="cpu".
flag = 0
# flag = 1
if flag:
    gpu_list = [0]
    gpu_list_str = ','.join(map(str, gpu_list))
    # setdefault leaves an already-set CUDA_VISIBLE_DEVICES untouched.
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", gpu_list_str)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    net = FooNet(neural_num=3, layers=3)
    net.to(device)

    # save
    net_state_dict = net.state_dict()
    path_state_dict = "./model_in_gpu_0.pkl"
    torch.save(net_state_dict, path_state_dict)

    # load
    # The commented-out call below omits map_location; the active call maps storages to the CPU.
    # state_dict_load = torch.load(path_state_dict)
    state_dict_load = torch.load(path_state_dict, map_location="cpu")
    print("state_dict_load:\n{}".format(state_dict_load))


# =================================== multi-GPU save
# Disabled by default (flag = 0). When enabled: wraps FooNet in DataParallel
# and saves the wrapped model's state_dict to ./model_in_multi_gpu.pkl.
flag = 0
# flag = 1
if flag:

    # Exit cleanly (status 0) when fewer than two GPUs are reported.
    if torch.cuda.device_count() < 2:
        print("gpu数量不足，请到多gpu环境下运行")
        import sys
        sys.exit(0)

    # NOTE(review): CUDA_VISIBLE_DEVICES is set after device_count() has already
    # been queried - confirm the variable still takes effect at this point.
    gpu_list = [0, 1, 2, 3]
    gpu_list_str = ','.join(map(str, gpu_list))
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", gpu_list_str)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    net = FooNet(neural_num=3, layers=3)
    net = nn.DataParallel(net)
    net.to(device)

    # save
    # The state_dict is taken from the DataParallel wrapper, not from the inner FooNet.
    net_state_dict = net.state_dict()
    path_state_dict = "./model_in_multi_gpu.pkl"
    torch.save(net_state_dict, path_state_dict)

# =================================== multi-GPU load
# flag = 0
flag = 1
if flag:

    net = FooNet(neural_num=3, layers=3)

    # Read the checkpoint written by the multi-GPU save section, mapped to the CPU.
    path_state_dict = "./model_in_multi_gpu.pkl"
    state_dict_load = torch.load(path_state_dict, map_location="cpu")
    print("state_dict_load:\n{}".format(state_dict_load))

    # Direct loading is left disabled; the keys are renamed first instead.
    # net.load_state_dict(state_dict_load)

    # Strip the leading "module." from every key that has it, keeping order.
    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for key, value in state_dict_load.items():
        if key.startswith('module.'):
            key = key[len('module.'):]
        new_state_dict[key] = value
    print("new_state_dict:\n{}".format(new_state_dict))

    net.load_state_dict(new_state_dict)

# 使用特定的GPU运行训练脚本（在命令行中指定可见GPU）
CUDA_VISIBLE_DEVICES=2 python training.py

补充：
这里需要说明的是，device定义中的cuda:0可以不变，无论你多卡的GPU如何选取。device_ids=[0,1,2,3]也相对固定，如果有两张卡就定义device_ids=[0,1]，如果有八张卡就定义device_ids=[0,1,2,3,4,5,6,7]。在多卡并行时，会有一个相对的主卡，就是cuda:0所指向的GPU。介绍到主卡就得引入os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"，在这里主卡是0卡。如果device定义为cuda:1，且os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3"，这里主卡是2卡（逻辑1号卡对应物理2号卡）。
提示：需要注意的是device_ids中包含的卡数要和os.environ["CUDA_VISIBLE_DEVICES"]中包含的卡数相等。这里的卡数是指数量，无需具体卡号一一对应。此外，batch_size的数量需要大于GPU的数量。
OK，最核心的部分给大家介绍完了，剩下的工作就是将之前单卡运行时的所有.cuda()替换为.to(device)即可。

PyTorch框架学习

GPU使用