# The contents of this note are in the comments.
# Simple notes from learning PyTorch.
import torch
import numpy
'''
# create tensor
x = torch.empty(5,3)
y = torch.rand(5,3)
z = torch.zeros(5,3)
print(y)
print(x)
print(z)
print(x.dtype)
print(z.dtype)
a=torch.tensor([5,5,3])
print(a)
b=torch.tensor([6,6,8])
print(b)
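# A minimal sketch (new variable c, added for illustration): torch.tensor also
# accepts an explicit dtype, so the element type can be set instead of inferred.
c=torch.tensor([5,5,3],dtype=torch.float32)
print(c.dtype)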
#basic operation
b=a+b
print(b)
b=torch.add(a,b)
print(b)
# special in-place variant: methods ending in _ modify the tensor in place
b.add_(a)
print(b)
# reshape with view() and extract a Python number with item()
test=torch.rand(4,2)
print(test)
test=test.view(8)
print(test)
test=torch.rand(1)
print(test)
test2=test.item()
print(test2)
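# A small sketch of view() with an inferred dimension (test3 is a new example
# tensor): passing -1 lets torch work out that size from the remaining dimensions;
# reshape() behaves similarly and also handles non-contiguous tensors.
test3=torch.rand(4,2)
print(test3.view(-1).shape)   # torch.Size([8])
print(test3.view(2,-1).shape) # torch.Size([2, 4])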
# conversion between torch tensors and numpy arrays
# (they share the underlying memory)
a=torch.ones(5)
print (a)
b=a.numpy()
print(b)
b[1]=2
print(a)
print(b)
a =numpy.ones(5)
b=torch.from_numpy(a)
print(a)
print(b)
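# The sharing works in the other direction as well: mutating the numpy array also
# changes the tensor created by from_numpy, since both use the same memory.
a[1]=3
print(a)
print(b)  # b shows the change too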
# if you have a GPU you can move tensors onto it
if torch.cuda.is_available():
    device=torch.device("cuda")
    print("yes I have")
    y=torch.ones_like(b,device=device)
    b=b.to(device)
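    # A minimal follow-up sketch: a GPU tensor cannot be converted to a numpy array
    # directly, so move the result back to the CPU first.
    c_cpu=(y+b).to("cpu")
    print(c_cpu.numpy())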
#warming up
# try to build a simple two layer neural net (using numpy)
N,D_in,H,D_out = 64,1000,100,10
x=numpy.random.randn(N,D_in)
y=numpy.random.randn(N,D_out)
w1=numpy.random.randn(D_in,H)
w2=numpy.random.randn(H,D_out)
learning_rate=1e-6
for t in range(500):
    # forward pass
    h=x.dot(w1)
    h_Relu=numpy.maximum(h,0)
    y_pred=h_Relu.dot(w2)
    # compute loss (squared error)
    loss = numpy.square(y_pred-y).sum()
    print(t,loss)
    # backward pass
    # 1. compute gradients by hand via the chain rule:
    #    loss = sum((y_pred - y)^2)  =>  dloss/dy_pred = 2*(y_pred - y)
    grad_y_pred=2.0*(y_pred-y)
    grad_w2 =h_Relu.T.dot(grad_y_pred)
    grad_h_relu=grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h<0]=0   # ReLU passes gradient only where h > 0
    grad_w1 = x.T.dot(grad_h)
    # 2. update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
# now try to do the same job using torch tensors
N,D_in,H,D_out = 64,1000,100,10
x=torch.randn(N,D_in)
y=torch.randn(N,D_out)
w1=torch.randn(D_in,H)
w2=torch.randn(H,D_out)
learning_rate=1e-6
for t in range(500):
    # forward pass
    h=x.mm(w1)
    h_Relu=h.clamp(min=0)
    y_pred=h_Relu.mm(w2)
    # compute loss (squared error)
    loss = (y_pred-y).pow(2).sum().item()
    print(t,loss)
    # backward pass
    # 1. compute gradients by hand, mirroring the numpy version
    grad_y_pred=2.0*(y_pred-y)
    grad_w2 =h_Relu.t().mm(grad_y_pred)
    grad_h_relu=grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h<0]=0
    grad_w1 = x.t().mm(grad_h)
    # 2. update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
#final version using autograd (automatic backward pass)
N,D_in,H,D_out = 64,1000,100,10
x=torch.randn(N,D_in,requires_grad=True)#default is False; strictly only w1 and w2 need gradients here
y=torch.randn(N,D_out,requires_grad=True)
w1=torch.randn(D_in,H,requires_grad=True)
w2=torch.randn(H,D_out,requires_grad=True)
learning_rate=1e-6
for t in range(500):
    # forward pass
    y_pred =x.mm(w1).clamp(min=0).mm(w2)
    # compute loss (squared error); keep it as a tensor so backward() can use it
    loss = (y_pred-y).pow(2).sum()
    loss_num=loss.item()
    print(t, loss_num)
    # backward pass: autograd computes all the gradients
    loss.backward()
    # Update weights outside the graph, then clear the accumulated gradients
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        w1.grad.zero_()
        w2.grad.zero_()
'''
import torch.nn as nn
import time
time_start=time.time()
N,D_in,H,D_out = 64,1000,100,10
x=torch.randn(N,D_in,requires_grad=True)#default is False; strictly only the model parameters need gradients
y=torch.randn(N,D_out,requires_grad=True)
model=torch.nn.Sequential(
    torch.nn.Linear(D_in,H),
    torch.nn.ReLU(),
    torch.nn.Linear(H,D_out),
)
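# A small added sketch: a quick look at what nn.Sequential registered; each
# nn.Linear contributes a weight and a bias, and these are the parameters the
# training loop below will update.
for name,param in model.named_parameters():
    print(name,param.shape)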
if torch.cuda.is_available():
    model=model.cuda()
    x=x.cuda()
    y=y.cuda()
loss_func=nn.MSELoss(reduction='sum')
learning_rate=1e-6
for t in range(20000):
    # forward pass
    y_pred =model(x)
    # compute loss (squared error via MSELoss with reduction='sum')
    loss = loss_func(y_pred , y)
    #loss_num=loss.item()
    print(t)
    # backward pass: autograd computes the gradients
    loss.backward()
    # update weights, then clear the accumulated gradients
    with torch.no_grad():
        for param in model.parameters():
            param-=learning_rate*param.grad
    model.zero_grad()
time_end=time.time()
print('total time cost:',time_end-time_start)
# Finally I tried running this with both the GPU and the CPU, which raised a few
# questions (see the hedged optimizer sketch after this list):
# - The CPU run takes only about half the time of the GPU run, i.e. the CPU is actually faster.
# - After switching to the unified nn.Sequential model, convergence is worse and the loss will not go down.
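# A hedged follow-up sketch (not the code above, just a common next step): the same
# loop written with torch.optim. nn.Linear initializes its weights much smaller than
# torch.randn does, so a learning rate around 1e-4 (an assumed value to tune, not taken
# from the note above) is a more usual choice here than 1e-6, and the optimizer takes
# over the update / zero_grad bookkeeping.
#
# optimizer=torch.optim.SGD(model.parameters(),lr=1e-4)
# for t in range(500):
#     y_pred=model(x)
#     loss=loss_func(y_pred,y)
#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()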