数据生成

  1. T=100
  2. masks = [0]*T
  3. masks[2]=1
  4. masks[10]=1
  5. def generate_data(size=100):
  6. train_x = torch.rand(size)
  7. target = list(accumulate([x*mask for x, mask in zip(train_x,masks)]))
  8. return train_x.reshape((-1,1)),torch.Tensor(target).reshape((-1,1))
  9. print(masks)

掩码生成

  1. [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  1. X,target=generate_data()
  2. X=X.to(d2l.try_gpu())
  3. target=target.to(d2l.try_gpu())
  4. print(X.shape,target.shape)
  5. result=target[-1]
  6. print(result)

训练数据和标签,最终结果

  1. torch.Size([100, 1]) torch.Size([100, 1])
  2. tensor([0.9501], device='cuda:0')

定义RNN网络

  1. def get_params(vocab_size, num_hiddens, device):# 输入词表大小,隐藏层大小
  2. num_inputs = num_outputs = vocab_size
  3. def normal(shape):
  4. return torch.randn(size=shape, device=device) * 0.01
  5. """随机生成一个大小为shape的tensor 均值为0,方差为0.01
  6. 因为要多次调用,所以封装起来
  7. """
  8. # 隐藏层参数
  9. W_xh = normal((num_inputs, num_hiddens))
  10. W_hh = normal((num_hiddens, num_hiddens))
  11. b_h = torch.zeros(num_hiddens, device=device)
  12. # 输出层参数
  13. W_hq = normal((num_hiddens, num_outputs))
  14. b_q = torch.zeros(num_outputs, device=device)
  15. # 附加梯度
  16. params = [W_xh, W_hh, b_h, W_hq, b_q] # 把参数封装起来成一个list
  17. for param in params:
  18. param.requires_grad_(True) # 需要算梯度
  19. return params
  20. def init_rnn_state(batch_size, num_hiddens, device):
  21. return (torch.zeros((batch_size, num_hiddens), device=device), )
  22. def rnn(inputs, state, params):
  23. # inputs的形状:(时间步数量,批量大小,词表大小)
  24. W_xh, W_hh, b_h, W_hq, b_q = params # 列表可以展开对应赋值
  25. H, = state # state是一个初始化tuple
  26. outputs = []
  27. H = torch.tanh(torch.mm(X, W_xh)+ torch.mm(H, W_hh) + b_h) # 激活函数为tanh
  28. Y = torch.mm(H, W_hq) + b_q
  29. outputs.append(Y) # 把所有时刻的输出存储起来
  30. return torch.cat(outputs, dim=0), (H,)
  31. """dim =0 :按照竖方向拼起来,列数不变,行数变成batch_size*numsteps,即把每一个batch的输出按照时间顺序拼成二维矩阵,输出每个样本的预测值和最后的隐藏状态"""
  32. class RNNModelScratch:
  33. def __init__(self, vocab_size, num_hiddens, device,
  34. get_params, init_state, forward_fn):
  35. self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
  36. self.params = get_params(vocab_size, num_hiddens, device)
  37. self.init_state, self.forward_fn = init_state, forward_fn
  38. def __call__(self, X, state):
  39. return self.forward_fn(X, state, self.params)
  40. def begin_state(self, batch_size, device): # 初始状态
  41. return self.init_state(batch_size, self.num_hiddens, device)
# Build the from-scratch RNN with a scalar input/output (vocab_size=1).
num_hiddens = 512
net = RNNModelScratch(
    vocab_size=1,
    num_hiddens=num_hiddens,
    device=d2l.try_gpu(),
    get_params=get_params,
    init_state=init_rnn_state,
    forward_fn=rnn,
)
# Fetch the initial state, sized by the first dimension of X.
state = net.begin_state(batch_size=X.shape[0], device=d2l.try_gpu())
# Run a single forward pass to inspect the output and state shapes.
Y, new_state = net(X.to(d2l.try_gpu()), state)
Y.shape, len(new_state), new_state[0].shape
# The second dimension of Y is the prediction vector.

训练

def train_ch(net, train_iter, vocab, lr, num_epochs, device,
              use_random_iter=False):
    """Train ``net`` by full-batch MSE regression on one ``(X, Y)`` pair.

    Args:
        net: Model called as ``net(X, state)`` that exposes ``begin_state``;
            either an ``nn.Module`` or an object with a ``params`` list.
        train_iter: A ``(X, Y)`` pair of tensors, reused for every epoch.
        vocab: Unused; kept so existing call sites keep working.
        lr: Learning rate.
        num_epochs: Number of update steps (one per epoch).
        device: Device the data and the initial state are placed on.
        use_random_iter: Unused; a fresh state is created every epoch
            regardless of this flag.
    """
    loss = nn.MSELoss()
    # Initialize
    if isinstance(net, nn.Module):
        updater = torch.optim.SGD(net.parameters(), lr)
    else:
        updater = lambda batch_size: d2l.sgd(net.params, lr, batch_size)
    use_optimizer = isinstance(updater, torch.optim.Optimizer)

    # The data never changes between epochs, so move and cast it once.
    X, Y = train_iter
    X = X.to(device)
    y_flat = Y.T.reshape(-1).to(device=device, dtype=torch.float32)

    for epoch in range(num_epochs):
        # Every epoch starts from a fresh state.
        state = net.begin_state(batch_size=X.shape[0], device=device)
        y_hat, state = net(X, state)
        y_hat = y_hat.to(torch.float32)
        # Give the labels the same shape as the predictions. Comparing an
        # (N, 1) prediction with an (N,) label would broadcast to (N, N)
        # and fit every prediction to the mean label instead of its own.
        l = loss(y_hat, y_flat.reshape(y_hat.shape))
        if use_optimizer:
            updater.zero_grad()
        l.backward()
        grad_clipping(net, 1)
        if use_optimizer:
            updater.step()
        else:
            # The loss is already a mean, so no further batch scaling.
            updater(batch_size=1)
        # Report every 100 epochs, skipping epoch 0.
        if epoch and epoch % 100 == 0:
            # Logging only: re-run the forward pass with the updated
            # parameters, from the state returned above, without autograd.
            with torch.no_grad():
                prediction = net(X, state)[0][-1]
            print(f"epoch:{epoch}  pedict:{prediction}")
# Training configuration.
num_epochs = 2000
lr = 0.01
# train_ch unpacks a single (features, labels) pair.
train_iters = (X, target)
train_ch(net, train_iters, vocab=1, lr=lr, num_epochs=num_epochs,
         device=d2l.try_gpu())
# Print the ground-truth final sum for comparison with the predictions above.
print(f"result:{result}")

结果

epoch:100  pedict:tensor([0.7851], device='cuda:0')
epoch:200  pedict:tensor([0.8946], device='cuda:0')
epoch:300  pedict:tensor([0.9044], device='cuda:0')
epoch:400  pedict:tensor([0.9055], device='cuda:0')
epoch:500  pedict:tensor([0.9060], device='cuda:0')
epoch:600  pedict:tensor([0.9063], device='cuda:0')
epoch:700  pedict:tensor([0.9067], device='cuda:0')
epoch:800  pedict:tensor([0.9070], device='cuda:0')
epoch:900  pedict:tensor([0.9073], device='cuda:0')
epoch:1000  pedict:tensor([0.9076], device='cuda:0')
epoch:1100  pedict:tensor([0.9079], device='cuda:0')
epoch:1200  pedict:tensor([0.9082], device='cuda:0')
epoch:1300  pedict:tensor([0.9084], device='cuda:0')
epoch:1400  pedict:tensor([0.9087], device='cuda:0')
epoch:1500  pedict:tensor([0.9090], device='cuda:0')
epoch:1600  pedict:tensor([0.9092], device='cuda:0')
epoch:1700  pedict:tensor([0.9095], device='cuda:0')
epoch:1800  pedict:tensor([0.9097], device='cuda:0')
epoch:1900  pedict:tensor([0.9099], device='cuda:0')
result:tensor([0.9501], device='cuda:0')

可以看到最后生成的数据与标签差别不大,在 500 个 epoch 后已经趋近收敛。
代码在https://github.com/yulinlina/RNNAddition