Data generation
import torch
from torch import nn
from itertools import accumulate
from d2l import torch as d2l

T = 100
masks = [0] * T
masks[2] = 1
masks[10] = 1

def generate_data(size=100):
    train_x = torch.rand(size)
    # Running (prefix) sums of the masked inputs: only positions with mask == 1 contribute
    target = list(accumulate([x * mask for x, mask in zip(train_x, masks)]))
    return train_x.reshape((-1, 1)), torch.Tensor(target).reshape((-1, 1))

print(masks)
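For reference, itertools.accumulate with its default operator produces running (prefix) sums, so target[t] is the sum of all masked inputs up to time step t. A minimal illustration:

from itertools import accumulate

print(list(accumulate([1, 2, 3, 4])))  # [1, 3, 6, 10] -- running sums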
The generated mask:
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
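Since the mask is 1 only at indices 2 and 10, the label at the final time step is simply x[2] + x[10]. A quick, purely illustrative check (the variable names x_check and target_check are introduced here only for this snippet):

x_check, target_check = generate_data()
print(torch.allclose(target_check[-1], x_check[2] + x_check[10]))  # True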
X, target = generate_data()
X = X.to(d2l.try_gpu())
target = target.to(d2l.try_gpu())
print(X.shape, target.shape)
result = target[-1]
print(result)
Training data and labels, and the final target value:
torch.Size([100, 1]) torch.Size([100, 1])
tensor([0.9501], device='cuda:0')
Defining the RNN network
def get_params(vocab_size, num_hiddens, device):
    # vocab_size: size of the input vocabulary; num_hiddens: size of the hidden layer
    num_inputs = num_outputs = vocab_size

    def normal(shape):
        # Draw a tensor of the given shape with mean 0 and standard deviation 0.01;
        # wrapped in a helper because it is called several times below
        return torch.randn(size=shape, device=device) * 0.01

    # Hidden-layer parameters
    W_xh = normal((num_inputs, num_hiddens))
    W_hh = normal((num_hiddens, num_hiddens))
    b_h = torch.zeros(num_hiddens, device=device)
    # Output-layer parameters
    W_hq = normal((num_hiddens, num_outputs))
    b_q = torch.zeros(num_outputs, device=device)
    # Attach gradients
    params = [W_xh, W_hh, b_h, W_hq, b_q]  # collect the parameters in a list
    for param in params:
        param.requires_grad_(True)  # gradients are needed
    return params

def init_rnn_state(batch_size, num_hiddens, device):
    return (torch.zeros((batch_size, num_hiddens), device=device), )

def rnn(inputs, state, params):
    # inputs shape: (number of time steps, batch size, vocab size)
    W_xh, W_hh, b_h, W_hq, b_q = params  # the parameter list unpacks into its elements
    H, = state  # state is the initial-state tuple
    outputs = []
    H = torch.tanh(torch.mm(inputs, W_xh) + torch.mm(H, W_hh) + b_h)  # tanh activation
    Y = torch.mm(H, W_hq) + b_q
    outputs.append(Y)  # store the outputs of every step
    return torch.cat(outputs, dim=0), (H,)
    # dim=0: concatenate vertically, so the number of columns stays the same and the
    # number of rows becomes batch_size * num_steps, i.e. the outputs are stacked in
    # time order into one 2-D matrix; return the prediction for each sample together
    # with the final hidden state

class RNNModelScratch:
    def __init__(self, vocab_size, num_hiddens, device,
                 get_params, init_state, forward_fn):
        self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
        self.params = get_params(vocab_size, num_hiddens, device)
        self.init_state, self.forward_fn = init_state, forward_fn

    def __call__(self, X, state):
        return self.forward_fn(X, state, self.params)

    def begin_state(self, batch_size, device):  # initial state
        return self.init_state(batch_size, self.num_hiddens, device)
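As a quick sanity check (illustrative only, using a small hidden size on CPU), the parameter shapes returned by get_params should come out as follows:

params = get_params(vocab_size=1, num_hiddens=4, device=torch.device('cpu'))
print([tuple(p.shape) for p in params])
# [(1, 4), (4, 4), (4,), (4, 1), (1,)] -- W_xh, W_hh, b_h, W_hq, b_q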
num_hiddens = 512
net = RNNModelScratch(1, num_hiddens, d2l.try_gpu(),
                      get_params, init_rnn_state, rnn)
state = net.begin_state(X.shape[0], d2l.try_gpu())  # get the initial state
Y, new_state = net(X.to(d2l.try_gpu()), state)  # run one forward step
Y.shape, len(new_state), new_state[0].shape  # the second dimension of Y is the prediction vector
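With 100 input positions and num_hiddens = 512, the forward step above should yield the following shapes (a quick assertion, assuming the setup is unchanged):

assert Y.shape == (100, 1)                # one prediction per input row
assert len(new_state) == 1
assert new_state[0].shape == (100, 512)   # hidden state H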
Training
def train_ch(net, train_iter, vocab, lr, num_epochs, device,
             use_random_iter=False):
    loss = nn.MSELoss()
    # Initialize the updater
    if isinstance(net, nn.Module):
        updater = torch.optim.SGD(net.parameters(), lr)
    else:
        updater = lambda batch_size: d2l.sgd(net.params, lr, batch_size)
    for epoch in range(num_epochs):
        state, timer = None, d2l.Timer()
        X, Y = train_iter
        if state is None or use_random_iter:
            state = net.begin_state(batch_size=X.shape[0], device=device)
        else:
            if isinstance(net, nn.Module) and not isinstance(state, tuple):
                state.detach_()
            else:
                for s in state:
                    s.detach_()
        y = Y.T.reshape(-1).to(torch.float32)
        X, y = X.to(device), y.to(device)
        y_hat, state = net(X, state)
        l = loss(y_hat.to(torch.float32), y.to(torch.float32)).mean()
        if isinstance(updater, torch.optim.Optimizer):
            updater.zero_grad()
            l.backward()
            d2l.grad_clipping(net, 1)
            updater.step()
        else:
            l.backward()
            d2l.grad_clipping(net, 1)
            # Since the `mean` function has been invoked
            updater(batch_size=1)
        if (epoch + 1) % 100 == 1 and epoch:
            print(f"epoch:{epoch} predict:{net(X, state)[0].detach()[-1]}")

num_epochs, lr = 2000, 0.01
train_iters = (X, target)
train_ch(net, train_iters, 1, lr, num_epochs, d2l.try_gpu())
print(f"result:{result}")
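The call to d2l.grad_clipping rescales the gradients by their global norm before each update. A minimal sketch of roughly what that helper does (modeled on the scratch implementation in the d2l book; the name grad_clipping_sketch is introduced here only for illustration):

def grad_clipping_sketch(net, theta):
    # Collect the trainable parameters of either an nn.Module or the scratch model
    if isinstance(net, nn.Module):
        params = [p for p in net.parameters() if p.requires_grad]
    else:
        params = net.params
    # Global L2 norm over all gradients
    norm = torch.sqrt(sum(torch.sum(p.grad ** 2) for p in params))
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm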
Results
epoch:100 predict:tensor([0.7851], device='cuda:0')
epoch:200 predict:tensor([0.8946], device='cuda:0')
epoch:300 predict:tensor([0.9044], device='cuda:0')
epoch:400 predict:tensor([0.9055], device='cuda:0')
epoch:500 predict:tensor([0.9060], device='cuda:0')
epoch:600 predict:tensor([0.9063], device='cuda:0')
epoch:700 predict:tensor([0.9067], device='cuda:0')
epoch:800 predict:tensor([0.9070], device='cuda:0')
epoch:900 predict:tensor([0.9073], device='cuda:0')
epoch:1000 predict:tensor([0.9076], device='cuda:0')
epoch:1100 predict:tensor([0.9079], device='cuda:0')
epoch:1200 predict:tensor([0.9082], device='cuda:0')
epoch:1300 predict:tensor([0.9084], device='cuda:0')
epoch:1400 predict:tensor([0.9087], device='cuda:0')
epoch:1500 predict:tensor([0.9090], device='cuda:0')
epoch:1600 predict:tensor([0.9092], device='cuda:0')
epoch:1700 predict:tensor([0.9095], device='cuda:0')
epoch:1800 predict:tensor([0.9097], device='cuda:0')
epoch:1900 predict:tensor([0.9099], device='cuda:0')
result:tensor([0.9501], device='cuda:0')
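To compare the trained model against the label once more after training, one can run a single forward pass (an illustrative snippet that reuses the variables defined above):

state = net.begin_state(X.shape[0], d2l.try_gpu())
y_hat, _ = net(X, state)
print(f"prediction: {y_hat.detach()[-1]}, label: {result}")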
As we can see, the final output is close to the label, and training has essentially converged after about 500 epochs.
The code is available at https://github.com/yulinlina/RNNAddition
