数据生成
# Length of the synthetic sequence.
T = 100
# Binary mask over time steps: only steps 2 and 10 feed the cumulative target.
masks = [1 if t in (2, 10) else 0 for t in range(T)]
def generate_data(size=100, mask_seq=None):
    """Generate one (inputs, targets) pair for the masked-addition task.

    Args:
        size: number of time steps / samples to draw.
        mask_seq: optional 0/1 sequence of length >= size selecting which
            steps contribute to the running sum; defaults to the
            module-level ``masks`` for backward compatibility.

    Returns:
        (train_x, target), both shaped (size, 1); ``target[t]`` is the
        cumulative sum of the masked inputs up to step t.
    """
    if mask_seq is None:
        mask_seq = masks
    train_x = torch.rand(size)
    # Vectorized replacement for itertools.accumulate over x*mask pairs:
    # zero out unmasked steps, then take the running sum.
    weights = torch.tensor(list(mask_seq)[:size], dtype=train_x.dtype)
    target = torch.cumsum(train_x * weights, dim=0)
    return train_x.reshape((-1, 1)), target.reshape((-1, 1))
print(masks)
掩码生成
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
# Build the dataset and move inputs and labels onto the available device.
X, target = generate_data()
device = d2l.try_gpu()
X = X.to(device)
target = target.to(device)
print(X.shape, target.shape)
# The label at the final time step is the value the network must reproduce.
result = target[-1]
print(result)
训练数据和标签,最终结果
torch.Size([100, 1]) torch.Size([100, 1])
tensor([0.9501], device='cuda:0')
定义RNN网络
def get_params(vocab_size, num_hiddens, device):
    """Allocate all learnable RNN parameters.

    Args:
        vocab_size: dimensionality shared by the input and output vectors.
        num_hiddens: width of the hidden layer.
        device: torch device on which the tensors are created.

    Returns:
        [W_xh, W_hh, b_h, W_hq, b_q], each with gradients enabled.
    """
    num_inputs = num_outputs = vocab_size

    def normal(shape):
        # Small random init (mean 0, std 0.01); wrapped in a helper because
        # every weight matrix below needs the same recipe.
        return torch.randn(size=shape, device=device) * 0.01

    # Hidden-layer parameters.
    W_xh = normal((num_inputs, num_hiddens))
    W_hh = normal((num_hiddens, num_hiddens))
    b_h = torch.zeros(num_hiddens, device=device)
    # Output-layer parameters.
    W_hq = normal((num_hiddens, num_outputs))
    b_q = torch.zeros(num_outputs, device=device)

    # Collect everything and mark it trainable.
    params = [W_xh, W_hh, b_h, W_hq, b_q]
    for p in params:
        p.requires_grad_(True)
    return params
def init_rnn_state(batch_size, num_hiddens, device):
    """Return the initial hidden state as a 1-tuple of a zero tensor."""
    initial = torch.zeros((batch_size, num_hiddens), device=device)
    return (initial,)
def rnn(inputs, state, params):
    """One forward step of the scratch RNN cell.

    Args:
        inputs: (batch_size, num_inputs) tensor for this step.
        state: 1-tuple holding the (batch_size, num_hiddens) hidden state.
        params: [W_xh, W_hh, b_h, W_hq, b_q] as produced by get_params.

    Returns:
        (Y, (H,)): the (batch_size, num_outputs) output and the new state.
    """
    W_xh, W_hh, b_h, W_hq, b_q = params  # a list unpacks positionally
    H, = state  # the state travels as a tuple
    # Bug fix: the original read the module-level global `X` here instead of
    # the `inputs` argument, silently coupling the cell to script state.
    H = torch.tanh(torch.mm(inputs, W_xh) + torch.mm(H, W_hh) + b_h)
    Y = torch.mm(H, W_hq) + b_q
    # Single step, so the original's one-element torch.cat is just Y.
    return Y, (H,)
class RNNModelScratch:
    """From-scratch RNN wrapper: owns the parameters and delegates both the
    forward pass and state creation to user-supplied functions."""

    def __init__(self, vocab_size, num_hiddens, device,
                 get_params, init_state, forward_fn):
        self.vocab_size = vocab_size
        self.num_hiddens = num_hiddens
        self.params = get_params(vocab_size, num_hiddens, device)
        self.init_state = init_state
        self.forward_fn = forward_fn

    def __call__(self, X, state):
        # Run the functional forward pass with the stored parameters.
        return self.forward_fn(X, state, self.params)

    def begin_state(self, batch_size, device):
        """Create a fresh hidden state for a new batch."""
        return self.init_state(batch_size, self.num_hiddens, device)
# Wide hidden layer so a single-layer cell has plenty of capacity.
num_hiddens = 512
device = d2l.try_gpu()
net = RNNModelScratch(1, num_hiddens, device,
                      get_params, init_rnn_state, rnn)
# Initial state sized to the full batch, then one forward step;
# Y's second dimension is the prediction vector.
state = net.begin_state(X.shape[0], device)
Y, new_state = net(X.to(device), state)
Y.shape, len(new_state), new_state[0].shape
训练
def train_ch(net, train_iter, vocab, lr, num_epochs, device,
             use_random_iter=False):
    """Train `net` on one fixed (X, Y) pair for `num_epochs` epochs.

    Args:
        net: an ``nn.Module`` or a scratch model exposing ``.params`` and
            ``.begin_state``.
        train_iter: a single (X, Y) tensor pair (not an iterator here).
        vocab: unused for this regression task; kept for signature
            compatibility with the generic d2l training interface.
        lr: SGD learning rate.
        num_epochs: number of passes over the single batch.
        device: torch device to train on.
        use_random_iter: if True, always reinitialize the state (it is
            reinitialized every epoch anyway — see NOTE below).
    """
    loss = nn.MSELoss()
    # Pick an updater matching the model flavour: a real optimizer for
    # nn.Module, otherwise plain SGD over the raw parameter list.
    if isinstance(net, nn.Module):
        updater = torch.optim.SGD(net.parameters(), lr)
    else:
        updater = lambda batch_size: d2l.sgd(net.params, lr, batch_size)
    for epoch in range(num_epochs):
        state, timer = None, d2l.Timer()
        X, Y = train_iter
        # NOTE(review): `state` is reset to None at the top of every epoch,
        # so the detach branch below is currently dead code; kept for parity
        # with the generic d2l training loop.
        if state is None or use_random_iter:
            state = net.begin_state(batch_size=X.shape[0], device=device)
        else:
            if isinstance(net, nn.Module) and not isinstance(state, tuple):
                state.detach_()
            else:
                for s in state:
                    s.detach_()
        y = Y.T.reshape(-1).to(torch.float32)
        X, y = X.to(device), y.to(device)
        y_hat, state = net(X, state)
        l = loss(y_hat.to(torch.float32), y.to(torch.float32)).mean()
        if isinstance(updater, torch.optim.Optimizer):
            updater.zero_grad()
            l.backward()
            grad_clipping(net, 1)
            updater.step()
        else:
            l.backward()
            grad_clipping(net, 1)
            # The loss is already a mean, so batch_size=1 for d2l.sgd.
            updater(batch_size=1)
        # Report the last-step prediction every 100 epochs (skip epoch 0).
        if (epoch + 1) % 100 == 1 and epoch:
            # Bug fix: progress message typo "pedict" -> "predict".
            print(f"epoch:{epoch} predict:{net(X,state)[0].detach()[-1]}")
num_epochs = 2000
lr = 0.01
# The whole dataset is handed over as one (inputs, labels) batch.
train_iters = (X, target)
train_ch(net, train_iters, 1, lr, num_epochs, d2l.try_gpu())
print(f"result:{result}")
结果
epoch:100 pedict:tensor([0.7851], device='cuda:0')
epoch:200 pedict:tensor([0.8946], device='cuda:0')
epoch:300 pedict:tensor([0.9044], device='cuda:0')
epoch:400 pedict:tensor([0.9055], device='cuda:0')
epoch:500 pedict:tensor([0.9060], device='cuda:0')
epoch:600 pedict:tensor([0.9063], device='cuda:0')
epoch:700 pedict:tensor([0.9067], device='cuda:0')
epoch:800 pedict:tensor([0.9070], device='cuda:0')
epoch:900 pedict:tensor([0.9073], device='cuda:0')
epoch:1000 pedict:tensor([0.9076], device='cuda:0')
epoch:1100 pedict:tensor([0.9079], device='cuda:0')
epoch:1200 pedict:tensor([0.9082], device='cuda:0')
epoch:1300 pedict:tensor([0.9084], device='cuda:0')
epoch:1400 pedict:tensor([0.9087], device='cuda:0')
epoch:1500 pedict:tensor([0.9090], device='cuda:0')
epoch:1600 pedict:tensor([0.9092], device='cuda:0')
epoch:1700 pedict:tensor([0.9095], device='cuda:0')
epoch:1800 pedict:tensor([0.9097], device='cuda:0')
epoch:1900 pedict:tensor([0.9099], device='cuda:0')
result:tensor([0.9501], device='cuda:0')
可以看到最后生成的数据与标签差别不大,在500epoch后已经趋近收敛
代码在https://github.com/yulinlina/RNNAddition