基础知识 - 线性回归(从零开始) - 《机器学习》

踩坑：矩阵乘法

w = torch.Tensor([[2],
                  [-3.4]])    # a 2x1 column vector (2 rows, 1 column)
features = torch.randn(1000, 2, dtype=torch.float32)    # 1000 samples x 2 features
# `*` is the element-wise product with broadcasting, not a matrix product.
# Shapes (1000, 2) and (2, 1) cannot be broadcast together, so the next line
# raises a RuntimeError if it is uncommented:
# labels = features*w
labels = torch.mm(features, w)    # matrix multiplication: (1000, 2) x (2, 1) -> (1000, 1)

Python的list(iterable)函数用于将元组、range等可迭代对象转换为list

indices = list(range(num_examples))    # range() returns a lazy range object, not a list; list() materializes it as [0, 1, ..., num_examples - 1]

random.shuffle(list)函数用于将list中的元素原地随机打乱（直接修改原list，返回值为None）

random.shuffle(indices) # shuffle the list built above in place

torch.LongTensor(list)用于根据list构建元素类型为64位整型（int64）的张量；这里的“Long”指的是数据类型，而不是向量的长度。index_select等函数要求索引是这种整型张量

# Build a LongTensor (int64 index tensor) from indices[i : i + batch_size].
# It holds batch_size entries, except for the final batch: the slice end is
# clamped at num_examples, so the last batch may be shorter.
j = torch.LongTensor(indices[i: min(i + batch_size, num_examples)])

torch.linspace(start, end, steps=100, out=None)方法返回一个1维张量，包含在区间[start, end]上均匀间隔的steps个点（两端点都包含在内），输出张量的长度由steps决定。注意：较新版本的PyTorch中steps不再有默认值，必须显式传入

# 4 evenly spaced values from 0 to 10, both endpoints included
a = torch.linspace(0,10,steps=4)
print(a)
# 5 evenly spaced values from 0 to 10, both endpoints included
b = torch.linspace(0,10,steps=5)
print(b)
结果：
tensor([ 0.0000,  3.3333,  6.6667, 10.0000])
tensor([ 0.0000,  2.5000,  5.0000,  7.5000, 10.0000])

index_select(tensor, dim, index)可以从张量的某个维度的指定位置选取数据

dim：表示从第几维挑选数据，类型为int值
index：表示在dim指定的维度上选取哪些位置的数据，类型为torch.Tensor类的实例

```python
a = torch.linspace(1, 12, steps=12).view(3, 4)
print(a)
b = torch.index_select(a, 0, torch.tensor([0, 2]))    # 选择第0维，索引0和2位置的数据
print(b)
print(a.index_select(0, torch.tensor([0, 2])))    # tensor对象也可调用该方法
c = torch.index_select(a, 1, torch.tensor([1, 3]))    # 选择第1维，索引1和3位置的数据
print(c)
```

结果：

```
tensor([[ 1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.],
        [ 9., 10., 11., 12.]])
tensor([[ 1.,  2.,  3.,  4.],
        [ 9., 10., 11., 12.]])
tensor([[ 1.,  2.,  3.,  4.],
        [ 9., 10., 11., 12.]])
tensor([[ 2.,  4.],
        [ 6.,  8.],
        [10., 12.]])
```

yield与return
- yield：含有yield的函数被调用时不会立即执行函数体，而是返回一个生成器对象；该对象可以迭代遍历，也可以通过next()方法逐个取出值。生成器保存的是生成数据的方式而不是全部数据，因此逐个yield元素时比较节约内存空间，可以达到随用随取的效果。
- return直接结束该函数的运行，return 后面的代码块不会执行，返回该函数的执行结果。
```python
def func(n=100000000):
    """Generator demo: build ``[1, ..., n - 1]``, print the elapsed time, then yield the list once.

    Calling ``func()`` only creates the generator; none of this body runs until
    the first ``next()``. The whole list is still built in memory before it is
    yielded, so this variant does not reduce peak memory by itself.

    Args:
        n: exclusive upper bound of the generated integers. Defaults to the
            original hard-coded 100000000.

    Yields:
        The complete list of integers from 1 to ``n - 1`` (a single item).
    """
    start_time = time.time()
    # Explicit append loop kept on purpose: the demo is about how long it takes.
    numbers = []  # renamed from `list` so the builtin is not shadowed
    for i in range(1, n):
        numbers.append(i)
    end_time = time.time()
    cost_time = end_time - start_time
    print(cost_time)
    yield numbers
func() # only creates a generator object; the function body has not started running yet
next(func())  # next() runs the body up to the yield, so the whole list is built before it is handed back
def func(n=100000000):
    """Eager counterpart of the generator demo: build ``[1, ..., n - 1]`` and return it.

    Unlike the ``yield`` version, the whole body runs as soon as the function is
    called, and the elapsed time is printed before returning.

    Args:
        n: exclusive upper bound of the generated integers. Defaults to the
            original hard-coded 100000000.

    Returns:
        The list of integers from 1 to ``n - 1``.
    """
    start_time = time.time()
    # Same loop as the generator version, kept explicit so the timings compare.
    numbers = []  # renamed from `list` so the builtin is not shadowed
    for i in range(1, n):
        numbers.append(i)
    end_time = time.time()
    cost_time = end_time - start_time
    print(cost_time)
    return numbers
func() # runs the whole loop before returning; the author reports it took tens of seconds

def data_iter(batch_size, features, labels):
    """Yield shuffled mini-batches of ``(features, labels)``.

    Sample indices are shuffled once per call with ``random.shuffle`` and then
    consumed in consecutive chunks of ``batch_size``. The final chunk is
    shorter when ``len(features)`` is not a multiple of ``batch_size``.

    Args:
        batch_size: number of samples per batch (the step of the index range).
        features: tensor whose dimension 0 indexes the samples.
        labels: tensor aligned with ``features`` along dimension 0; assumed to
            live on the same device as ``features``.

    Yields:
        ``(features.index_select(0, j), labels.index_select(0, j))`` for each
        chunk ``j`` of the shuffled indices.
    """
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)
    for i in range(0, num_examples, batch_size):
        # Slicing already clamps at the end of the list, so no min() is needed.
        # The int64 index is created on the data's device because index_select
        # requires the index and the input to be on the same device.
        j = torch.tensor(indices[i:i + batch_size], dtype=torch.long,
                         device=features.device)
        yield features.index_select(0, j), labels.index_select(0, j)
# Iterating the generator yields one mini-batch (X, y) per step; the first
# argument 10 is the batch size, i.e. 10 samples per batch (loop body omitted here)
for X, y in data_iter(10, features, labels):

完整程序代码如下：

import torch
from IPython import display
from matplotlib import pyplot as plt
import random
def use_svg_display():
    """Display figures as vector graphics (SVG)."""
    # NOTE(review): this IPython helper is reported as deprecated in recent
    # IPython releases — confirm against the installed version.
    display.set_matplotlib_formats('svg')
def set_fig_size(fig_size=(3.5, 2.5)):
    """Enable SVG display and set matplotlib's default figure size.

    Args:
        fig_size: value stored in ``plt.rcParams['figure.figsize']``;
            defaults to ``(3.5, 2.5)``.
    """
    use_svg_display()
    # Set the figure size
    plt.rcParams['figure.figsize'] = fig_size
def data_iter(batch_size, features, labels):
    """Yield shuffled mini-batches of ``(features, labels)``.

    Sample indices are shuffled once per call with ``random.shuffle`` and then
    consumed in consecutive chunks of ``batch_size``. The final chunk is
    shorter when ``len(features)`` is not a multiple of ``batch_size``.

    Args:
        batch_size: number of samples per batch (the step of the index range).
        features: tensor whose dimension 0 indexes the samples.
        labels: tensor aligned with ``features`` along dimension 0; assumed to
            live on the same device as ``features``.

    Yields:
        ``(features.index_select(0, j), labels.index_select(0, j))`` for each
        chunk ``j`` of the shuffled indices.
    """
    num = len(features)
    indices = list(range(num))
    random.shuffle(indices)
    for i in range(0, num, batch_size):
        # Slicing clamps at the end of the list, so the last batch is simply
        # shorter; no explicit min() is needed.
        # The int64 index is created on the data's device because index_select
        # requires the index and the input to be on the same device.
        j = torch.tensor(indices[i:i + batch_size], dtype=torch.long,
                         device=features.device)
        yield features.index_select(0, j), labels.index_select(0, j)
def linear_regression(w, x, b):
    """Linear model: the matrix product of ``x`` and ``w`` plus the bias ``b``.

    Note the argument order ``(w, x, b)``; the original author remarks that it
    differs from the signature used in the tutorial being followed.
    """
    projection = x.mm(w)
    return projection + b
def squared_loss(y_hat, y):
    """Element-wise halved squared error between predictions and targets.

    ``y`` is viewed with the shape of ``y_hat`` before subtracting. The result
    is a tensor with one loss per element, not a scalar, so the caller has to
    reduce it. Author's note: PyTorch's built-in MSELoss does not divide by 2.
    """
    target = y.view_as(y_hat)
    residual = y_hat - target
    return residual.pow(2) / 2
def sgd(params, lr, batch_size):
    """Apply one mini-batch SGD step to every parameter, in place.

    Each parameter is updated as ``param -= lr * param.grad / batch_size``.
    Dividing by ``batch_size`` turns a gradient of a summed loss into the
    gradient of the mean loss. Gradients are not cleared here; the caller must
    zero them after each step.

    Args:
        params: iterable of tensors to update.
        lr: learning rate.
        batch_size: divisor applied to each gradient.
    """
    # no_grad() keeps the update itself out of the autograd graph; it replaces
    # the legacy `param.data` access.
    with torch.no_grad():
        for param in params:
            if param.grad is None:
                # Nothing was back-propagated to this parameter; skip it
                # instead of failing on `lr * None`.
                continue
            param -= lr * param.grad / batch_size
if __name__ == '__main__':
    # End-to-end demo: synthesize a linear data set, plot it, then fit w and b
    # with mini-batch SGD and compare them with the true parameters.
    num_inputs = 2
    num_examples = 1000
    # Ground-truth parameters used to generate the labels.
    w_true = torch.Tensor([[2],
                           [-3.4]])
    b_true = 4.2
    features = torch.randn(num_examples, num_inputs, dtype=torch.float32)
    # labels = features @ w_true + b_true, plus Gaussian noise with std 0.01.
    labels = torch.mm(features, w_true) + b_true
    labels += torch.normal(mean=0.0, std=0.01, size=labels.size())
    # Trainable parameters; requires_grad=True so backward() fills in .grad.
    w = torch.normal(mean=0.0, std=0.01, size=[num_inputs, 1], dtype=torch.float32, requires_grad=True)
    b = torch.zeros(1, dtype=torch.float32, requires_grad=True)
    set_fig_size()
    plt.scatter(features[:, 1], labels, 1)
    plt.show()
    # Training hyper-parameters.
    lr = 0.03
    num_epochs = 3
    net = linear_regression
    loss = squared_loss
    batch_size = 10
    for epoch in range(num_epochs):  # one pass over the data per epoch
        # X and y are the features and labels of one mini-batch. Every sample
        # is used once per epoch.
        for X, y in data_iter(batch_size, features, labels):
            batch_loss = loss(net(w, X, b), y).sum()  # summed loss of this mini-batch
            batch_loss.backward()  # gradients of the batch loss w.r.t. w and b
            sgd([w, b], lr, batch_size)  # update both weight and bias
            # Gradients accumulate across backward() calls, so reset them.
            w.grad.data.zero_()
            b.grad.data.zero_()
        # Evaluation only: no autograd graph is needed for the reported loss.
        with torch.no_grad():
            train_l = loss(net(w, features, b), labels)
        print('epoch %d, loss %f' % (epoch + 1, train_l.mean().item()))
    print(w_true, '\n', w)    # compare learned weights with the ground truth
    print(b_true, '\n', b)

几个注意的地方：

l = loss(net(w, X, b), y).sum()
l.backward()

由于pytorch的backward()在不传入gradient参数时只能对标量求导，因此需要先用.sum()或.mean()把损失转成标量，两者都可以。在sgd函数处加上断点后发现选取.sum()函数后w与b的梯度普遍为两位数，而选取.mean()函数后w与b的梯度仅为一位数。其实原理很简单：
线性回归(从零开始) - 图1
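这是sum形式的损失函数：$\ell_{\text{sum}}(\mathbf{w}, b) = \sum_{i \in \mathcal{B}} \frac{1}{2}\left(\hat{y}^{(i)} - y^{(i)}\right)^2$，其中 $\mathcal{B}$ 表示当前小批量，$\hat{y}^{(i)} = \mathbf{x}^{(i)\top}\mathbf{w} + b$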

线性回归(从零开始) - 图2
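这是mean形式的损失函数：$\ell_{\text{mean}}(\mathbf{w}, b) = \frac{1}{|\mathcal{B}|} \sum_{i \in \mathcal{B}} \frac{1}{2}\left(\hat{y}^{(i)} - y^{(i)}\right)^2$，其中 $\mathcal{B}$ 表示当前小批量，$|\mathcal{B}|$ 即batch_size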

用脚想想都知道，第一个对w和b求完导结果是第二个的batch_size倍。所以，使用.sum()函数w和b梯度更大。不过无所谓，反正参数学习的时候也要/batch_size，最后减的还是均值。

所以说，选取mean函数后，sgd函数中学习参数时就无需/batch_size了，这样之后与原来的结果是一样的。

还有，sgd([w, b], lr, size)里一开始忘记加上b了，折腾了很久时间。以后不能忘记偏置项参数也要进行更新。

框架学习也没那么容易，一步一步来。