Data generation

    from itertools import accumulate
    import torch
    from torch import nn
    from d2l import torch as d2l

    T = 100
    masks = [0] * T    # only steps 2 and 10 contribute to the sum
    masks[2] = 1
    masks[10] = 1

    def generate_data(size=100):
        train_x = torch.rand(size)
        # Cumulative sum of the masked inputs: the label at step t is sum of x[i]*masks[i] for i <= t
        target = list(accumulate([x * mask for x, mask in zip(train_x, masks)]))
        return train_x.reshape((-1, 1)), torch.Tensor(target).reshape((-1, 1))

    print(masks)

Generated mask

    [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    X, target = generate_data()
    X = X.to(d2l.try_gpu())
    target = target.to(d2l.try_gpu())
    print(X.shape, target.shape)
    result = target[-1]    # final label: the value the network should ultimately predict
    print(result)

Training data and label shapes, and the final target value

    torch.Size([100, 1]) torch.Size([100, 1])
    tensor([0.9501], device='cuda:0')
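
Since only masks[2] and masks[10] are nonzero, the cumulative target stops changing after step 10, and its final value is simply x[2] + x[10]; result holds exactly that sum. A quick sanity check (a minimal sketch, assuming X, target, and result are on the GPU as set up above):

    # The last label should equal the sum of the two unmasked inputs.
    expected = X[2] + X[10]
    print(torch.allclose(result, expected))   # should print True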

Defining the RNN

    def get_params(vocab_size, num_hiddens, device):   # vocabulary size and hidden size
        num_inputs = num_outputs = vocab_size

        def normal(shape):
            """Return a tensor of the given shape with mean 0 and standard deviation 0.01.
            Wrapped in a helper because it is called several times."""
            return torch.randn(size=shape, device=device) * 0.01

        # Hidden-layer parameters
        W_xh = normal((num_inputs, num_hiddens))
        W_hh = normal((num_hiddens, num_hiddens))
        b_h = torch.zeros(num_hiddens, device=device)
        # Output-layer parameters
        W_hq = normal((num_hiddens, num_outputs))
        b_q = torch.zeros(num_outputs, device=device)
        # Collect the parameters in a list and attach gradients
        params = [W_xh, W_hh, b_h, W_hq, b_q]
        for param in params:
            param.requires_grad_(True)   # gradients are needed
        return params

    def init_rnn_state(batch_size, num_hiddens, device):
        return (torch.zeros((batch_size, num_hiddens), device=device), )

    def rnn(inputs, state, params):
        # In the original d2l version, inputs has shape (num_steps, batch_size, vocab_size)
        # and is looped over; here the whole (batch_size, 1) tensor is processed in one step.
        W_xh, W_hh, b_h, W_hq, b_q = params   # unpack the parameter list
        H, = state                            # state is a tuple holding the hidden state
        outputs = []
        H = torch.tanh(torch.mm(inputs, W_xh) + torch.mm(H, W_hh) + b_h)   # tanh activation
        Y = torch.mm(H, W_hq) + b_q
        outputs.append(Y)   # store the output of every step
        # dim=0: concatenate along the rows, so the column count stays the same and the row
        # count becomes batch_size * num_steps, i.e. the per-step outputs stacked into one
        # 2-D matrix, returned together with the final hidden state.
        return torch.cat(outputs, dim=0), (H,)

    class RNNModelScratch:
        def __init__(self, vocab_size, num_hiddens, device,
                     get_params, init_state, forward_fn):
            self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
            self.params = get_params(vocab_size, num_hiddens, device)
            self.init_state, self.forward_fn = init_state, forward_fn

        def __call__(self, X, state):
            return self.forward_fn(X, state, self.params)

        def begin_state(self, batch_size, device):   # initial hidden state
            return self.init_state(batch_size, self.num_hiddens, device)
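
The rnn function above is one step of the standard update H_t = tanh(X_t·W_xh + H_{t-1}·W_hh + b_h), Y_t = H_t·W_hq + b_q; the time-step loop of the original d2l implementation has been dropped, so a single call processes the whole batch in one step. A tiny CPU sanity check (a sketch with toy sizes chosen for illustration):

    # Verify that rnn() matches the dense update written out by hand.
    params = get_params(1, 4, torch.device('cpu'))
    W_xh, W_hh, b_h, W_hq, b_q = params
    x = torch.rand(3, 1)                              # toy batch of 3 scalar inputs
    (H0,) = init_rnn_state(3, 4, torch.device('cpu'))
    Y, (H1,) = rnn(x, (H0,), params)
    assert torch.allclose(H1, torch.tanh(x @ W_xh + H0 @ W_hh + b_h))
    assert torch.allclose(Y, H1 @ W_hq + b_q)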
    num_hiddens = 512
    net = RNNModelScratch(1, num_hiddens, d2l.try_gpu(),
                          get_params, init_rnn_state, rnn)
    state = net.begin_state(X.shape[0], d2l.try_gpu())   # fetch the initial state
    Y, new_state = net(X.to(d2l.try_gpu()), state)       # one forward pass
    Y.shape, len(new_state), new_state[0].shape
    # Y holds one prediction per input row; expected shapes:
    # (torch.Size([100, 1]), 1, torch.Size([100, 512]))

Training

    def train_ch(net, train_iter, vocab, lr, num_epochs, device,
                 use_random_iter=False):
        loss = nn.MSELoss()
        # Initialize the updater: an optimizer for nn.Module models, plain SGD otherwise
        if isinstance(net, nn.Module):
            updater = torch.optim.SGD(net.parameters(), lr)
        else:
            updater = lambda batch_size: d2l.sgd(net.params, lr, batch_size)
        for epoch in range(num_epochs):
            state, timer = None, d2l.Timer()
            X, Y = train_iter
            if state is None or use_random_iter:
                state = net.begin_state(batch_size=X.shape[0], device=device)
            else:
                if isinstance(net, nn.Module) and not isinstance(state, tuple):
                    state.detach_()
                else:
                    for s in state:
                        s.detach_()
            y = Y.T.reshape(-1).to(torch.float32)
            X, y = X.to(device), y.to(device)
            y_hat, state = net(X, state)
            l = loss(y_hat.to(torch.float32), y.to(torch.float32)).mean()
            if isinstance(updater, torch.optim.Optimizer):
                updater.zero_grad()
                l.backward()
                d2l.grad_clipping(net, 1)
                updater.step()
            else:
                l.backward()
                d2l.grad_clipping(net, 1)
                # The loss is already a mean, so use batch_size=1 here
                updater(batch_size=1)
            if (epoch + 1) % 100 == 1 and epoch:
                print(f"epoch:{epoch} predict:{net(X, state)[0].detach()[-1]}")

    num_epochs, lr = 2000, 0.01
    train_iters = (X, target)
    train_ch(net, train_iters, 1, lr, num_epochs, d2l.try_gpu())
    print(f"result:{result}")

Results

    epoch:100 predict:tensor([0.7851], device='cuda:0')
    epoch:200 predict:tensor([0.8946], device='cuda:0')
    epoch:300 predict:tensor([0.9044], device='cuda:0')
    epoch:400 predict:tensor([0.9055], device='cuda:0')
    epoch:500 predict:tensor([0.9060], device='cuda:0')
    epoch:600 predict:tensor([0.9063], device='cuda:0')
    epoch:700 predict:tensor([0.9067], device='cuda:0')
    epoch:800 predict:tensor([0.9070], device='cuda:0')
    epoch:900 predict:tensor([0.9073], device='cuda:0')
    epoch:1000 predict:tensor([0.9076], device='cuda:0')
    epoch:1100 predict:tensor([0.9079], device='cuda:0')
    epoch:1200 predict:tensor([0.9082], device='cuda:0')
    epoch:1300 predict:tensor([0.9084], device='cuda:0')
    epoch:1400 predict:tensor([0.9087], device='cuda:0')
    epoch:1500 predict:tensor([0.9090], device='cuda:0')
    epoch:1600 predict:tensor([0.9092], device='cuda:0')
    epoch:1700 predict:tensor([0.9095], device='cuda:0')
    epoch:1800 predict:tensor([0.9097], device='cuda:0')
    epoch:1900 predict:tensor([0.9099], device='cuda:0')
    result:tensor([0.9501], device='cuda:0')

The final prediction (0.9099) is reasonably close to the label (0.9501), and the output has essentially stopped changing after about 500 epochs.
The full code is available at https://github.com/yulinlina/RNNAddition