Pytorch学习5 - Pytorch的简单使用

/ 技术文章 / 0 条评论 / 1350浏览

Pytorch学习5 - Pytorch的简单使用

来自七月在线视频,可在B站观看 https://www.bilibili.com/video/BV12741177Cu

跟着视频学习Pytorch

用numpy实现两层神经网络

一个全连接ReLU神经网络,一个隐藏层,没有bias,用来从x预测y,使用L2 Loss。

这一实现完全使用numpy来计算前向神经网络,loss,和反向传播。 构建神经网络的三个步骤:前向传播(forward pass)、计算loss、反向传播(backward pass)。

numpy ndarray是一个普通的n维数组,它不知道任何关于深度学习或者梯度的知识,也不知道计算图,只是一种用来计算数学运算的数据结构。

import torch
import numpy as np

# Problem sizes: N samples, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random training data.
X = np.random.randn(N, D_in)
Y = np.random.randn(N, D_out)

# Random initial weights; for simplicity the biases b1/b2 are left out (treated as 0).
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6

# Train for 500 iterations.
for t in range(500):
    # Forward pass. The activations get their own names so that the
    # hidden-layer size H is not overwritten by an array.
    hidden = X.dot(w1)
    hidden_relu = np.maximum(hidden, 0)
    Y_pre = hidden_relu.dot(w2)

    # Compute the loss: sum of squared errors (L2 loss; summed, not averaged).
    loss = np.square(Y_pre - Y).sum()
    print(loss)

    # Backward pass: differentiate the loss with respect to each weight
    # matrix by applying the chain rule by hand.
    grad_Y_pred = 2.0 * (Y_pre - Y)
    grad_w2 = hidden_relu.T.dot(grad_Y_pred)
    grad_hidden_relu = grad_Y_pred.dot(w2.T)
    grad_hidden = grad_hidden_relu.copy()
    grad_hidden[hidden < 0] = 0  # ReLU passes no gradient where its input was negative
    grad_w1 = X.T.dot(grad_hidden)

    # Gradient-descent update of w1 and w2.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

改成Pytorch计算

全程照搬上面的例子,将numpy的方法替换为torch的方法:

import torch

# Problem sizes: N samples, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random training data.
X = torch.randn(N, D_in)
Y = torch.randn(N, D_out)

# Random initial weights; for simplicity the biases b1/b2 are left out (treated as 0).
w1 = torch.randn(D_in, H)
w2 = torch.randn(H, D_out)

learning_rate = 1e-6

# Train for 500 iterations.
for t in range(500):
    # Forward pass. The activations get their own names so that the
    # hidden-layer size H is not overwritten by a tensor.
    hidden = X.mm(w1)
    hidden_relu = hidden.clamp(min=0)
    Y_pre = hidden_relu.mm(w2)

    # Compute the loss: sum of squared errors (L2 loss; summed, not averaged).
    loss = (Y_pre - Y).pow(2).sum().item()
    print(t, loss)

    # Backward pass: differentiate the loss with respect to each weight
    # matrix by applying the chain rule by hand.
    grad_Y_pred = 2.0 * (Y_pre - Y)
    grad_w2 = hidden_relu.t().mm(grad_Y_pred)
    grad_hidden_relu = grad_Y_pred.mm(w2.t())
    grad_hidden = grad_hidden_relu.clone()
    grad_hidden[hidden < 0] = 0  # ReLU passes no gradient where its input was negative
    grad_w1 = X.t().mm(grad_hidden)

    # Gradient-descent update of w1 and w2.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

Pytorch 的autograd

使用Pytorch 的autograd,简化后向传播的代码。

import torch

# Problem sizes: N samples, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random training data. The data is not being optimized, so it does not
# request gradients; only the weights below do.
X = torch.randn(N, D_in)
Y = torch.randn(N, D_out)

# Random initial weights, tracked by autograd; for simplicity the biases
# b1/b2 are left out (treated as 0).
w1 = torch.randn(D_in, H, requires_grad=True)
w2 = torch.randn(H, D_out, requires_grad=True)

learning_rate = 1e-6

# Train for 500 iterations.
for t in range(500):
    # Forward pass: linear -> ReLU (clamp at 0) -> linear.
    Y_pre = X.mm(w1).clamp(min=0).mm(w2)

    # Compute the loss: sum of squared errors (L2 loss; summed, not averaged).
    loss = (Y_pre - Y).pow(2).sum()
    print(t, loss.item())

    # Backward pass: autograd fills in w1.grad and w2.grad.
    loss.backward()

    # Update w1 and w2 from their gradients. The update runs under no_grad
    # so that the in-place weight changes are not recorded by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # Gradients accumulate across backward() calls, so reset them.
        w1.grad.zero_()
        w2.grad.zero_()

Pytorch 内置神经网络nn

使用Pytorch 的nn库,简化模型定义方法。

import torch
import torch.nn as nn

# Problem sizes: N samples, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random training data. The data is not being optimized, so it does not
# request gradients; only the model parameters do.
X = torch.randn(N, D_in)
Y = torch.randn(N, D_out)

# Two linear layers with a ReLU in between. Both layers are created with
# bias=False, i.e. b1 and b2 are treated as 0 for simplicity.
model = nn.Sequential(
    nn.Linear(D_in, H, bias=False),
    nn.ReLU(),
    nn.Linear(H, D_out, bias=False),
)

# Author's note: initialization affects the result; with this plain
# gradient-descent update, normal-initialized weights work somewhat better.
nn.init.normal_(model[0].weight)
nn.init.normal_(model[2].weight)


# Move the data and the model to the GPU when one is available.
if torch.cuda.is_available():
    X = X.to("cuda")
    Y = Y.to("cuda")
    model = model.to("cuda")

learning_rate = 1e-6
loss_fn = nn.MSELoss(reduction='sum')

# Train for 500 iterations.
for t in range(500):
    # Forward pass; calling the model invokes its forward().
    Y_pre = model(X)

    # Compute the loss: squared errors summed over all elements (reduction='sum').
    loss = loss_fn(Y_pre, Y)
    print(t, loss.item())

    # Backward pass: autograd fills in .grad for every model parameter.
    loss.backward()

    # Gradient-descent update of every parameter. The update runs under
    # no_grad so that the in-place changes are not recorded by autograd.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

        # Clear the gradients so they do not accumulate into the next iteration.
        model.zero_grad()

使用Pytorch 的optimizer

使用Pytorch 的optimizer,简化参数更新(梯度下降)的代码。使用更强大的优化器Adam,得到更好的效果。

import torch
import torch.nn as nn

# Problem sizes: N samples, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random training data. The data is not being optimized, so it does not
# request gradients; only the model parameters do.
X = torch.randn(N, D_in)
Y = torch.randn(N, D_out)

# Two linear layers with a ReLU in between. Both layers are created with
# bias=False, i.e. b1 and b2 are treated as 0 for simplicity.
model = nn.Sequential(
    nn.Linear(D_in, H, bias=False),
    nn.ReLU(),
    nn.Linear(H, D_out, bias=False),
)

# Move the data and the model to the GPU when one is available.
if torch.cuda.is_available():
    X = X.to("cuda")
    Y = Y.to("cuda")
    model = model.to("cuda")

learning_rate = 1e-4  # author's note: for Adam, 1e-3 to 1e-4 works fairly well
loss_fn = nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train for 500 iterations.
for t in range(500):
    # Forward pass; calling the model invokes its forward().
    Y_pre = model(X)

    # Compute the loss: squared errors summed over all elements (reduction='sum').
    loss = loss_fn(Y_pre, Y)
    print(t, loss.item())

    # Clear the old gradients before computing new ones.
    optimizer.zero_grad()

    # Backward pass.
    loss.backward()

    # Let the optimizer update the model parameters from their gradients.
    optimizer.step()

自定义神经网络

继承nn.Module,可自定义神经网络。该方法可适用于结构更为复杂的神经网络定义方法

import torch
import torch.nn as nn

# Problem sizes: N samples, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random training data. The data is not being optimized, so it does not
# request gradients.
X = torch.randn(N, D_in)
Y = torch.randn(N, D_out)

# Model definition
class TwoLayerNet(nn.Module):
    """Two linear layers without bias, with a ReLU (clamp at zero) in between."""

    def __init__(self, D_in, H, D_out):
        super().__init__()
        # Both layers are built with bias=False, so neither adds a bias term.
        self.linear1 = nn.Linear(D_in, H, bias=False)
        self.linear2 = nn.Linear(H, D_out, bias=False)

    def forward(self, X):
        """Return the network output for input X."""
        hidden = self.linear1(X)
        activated = hidden.clamp(min=0)  # ReLU
        return self.linear2(activated)


# Build the network; for simplicity it has no bias terms (b1 and b2 are 0).
model = TwoLayerNet(D_in, H, D_out)

# Move the data and the model to the GPU when one is available.
if torch.cuda.is_available():
    X, Y = X.to("cuda"), Y.to("cuda")
    model = model.to("cuda")

learning_rate = 1e-4  # author's note: for Adam, 1e-3 to 1e-4 works fairly well
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss(reduction='sum')

# Train for 500 iterations.
for t in range(500):
    # Clear the gradients of the previous iteration before backpropagating.
    optimizer.zero_grad()

    # Forward pass (calling the model invokes its forward()), followed by
    # the loss: squared errors summed over all elements.
    Y_pre = model(X)
    loss = loss_fn(Y_pre, Y)
    print(t, loss.item())

    # Backward pass, then let the optimizer update the model parameters.
    loss.backward()
    optimizer.step()