My code is as follows:
import torch
from torchvision import transforms
from torch.utils import data
import torchvision

#============== load the dataset
def get_dataloader_workers():
    return 4

def load_data_fashion_mnist(batch_size, resize=None):
    trans = [transforms.ToTensor()]
    if resize:
        trans.insert(0, transforms.Resize(resize))
    trans = transforms.Compose(trans)
    mnist_train = torchvision.datasets.FashionMNIST(
        root="../data", train=True, transform=trans, download=True)
    mnist_test = torchvision.datasets.FashionMNIST(
        root="../data", train=False, transform=trans, download=True)
    return (data.DataLoader(mnist_train, batch_size, shuffle=True,
                            num_workers=get_dataloader_workers()),
            data.DataLoader(mnist_test, batch_size, shuffle=False,
                            num_workers=get_dataloader_workers()))

batch_size = 256
train_iter, test_iter = load_data_fashion_mnist(batch_size)

#============= model parameters
num_inputs, num_outputs, num_hiddens = 784, 10, 256
# parameter initialization
W1 = torch.randn((num_inputs, num_hiddens), requires_grad=True) * 0.01
b1 = torch.zeros(num_hiddens, requires_grad=True)
W2 = torch.randn((num_hiddens, num_outputs), requires_grad=True) * 0.01
b2 = torch.zeros(num_outputs, requires_grad=True)

#=========== activation func
def relu(X):
    a = torch.zeros_like(X)
    return torch.max(X, a)

def softmax(X):
    Exp_x = torch.exp(X - X.max(dim=1, keepdim=True)[0])
    partition = Exp_x.sum(axis=1, keepdim=True)
    return Exp_x / partition

#=========== model
def mlp(X):
    """Take a batch of input images."""
    X = X.reshape((-1, num_inputs))  # flatten each image into a length-784 vector
    H = relu(torch.matmul(X, W1) + b1)  # hidden layer
    O = softmax(torch.matmul(H, W2) + b2)
    return O

def cross_entropy_loss(y_hat, y):
    return -torch.log(y_hat[range(len(y_hat)), y])

def sgd(params, lr, batch_size):
    """minibatch sgd"""
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()

lr = 0.03
num_epochs = 3
net = mlp
loss = cross_entropy_loss

def train():
    for epoch in range(num_epochs):
        for X, y in train_iter:
            y_hat = net(X)
            print(y_hat[0])
            print(y_hat.shape)
            print(y)
            l = loss(y_hat, y)
            print("The shape of loss: ", l.shape)
            l.sum().backward()
            assert b2.grad is not None, "b2.grad is none"
            assert W2.grad is not None, "W2.grad is None"
            sgd([W1, b1, W2, b2], lr, batch_size)
        with torch.no_grad():
            train_l = loss(net(X), y).mean()
            print(f' epoch {epoch+1}, loss {float(train_l.mean()):f}')

if __name__ == "__main__":
    train()
When I run the code, an exception is raised (error screenshot). I checked the forward and backward passes, but it still confuses me. Can anyone help fix this code? Please help!
I tried printing b2.grad and it works fine, but when I print W2.grad I hit the "W2.grad is none" assertion error.
The error message "W2.grad is none" arises because W1 and W2 are not leaf tensors. In the line

W2 = torch.randn((num_hiddens, num_outputs), requires_grad=True) * 0.01

torch.randn(..., requires_grad=True) creates a leaf tensor, but multiplying it by 0.01 produces a new tensor that is the output of an autograd operation. PyTorch only populates the .grad attribute of leaf tensors, so after l.sum().backward() the gradient flows back to the anonymous randn result, while W2.grad (and W1.grad) stay None. b1 and b2 are created directly by torch.zeros(..., requires_grad=True) and never modified afterwards, so they remain leaves, which is why printing b2.grad works.
Here's a breakdown of the issue and how to fix it:
Problem:
- Non-Leaf Parameters: W1 and W2 are the results of a multiplication rather than leaf tensors, so backward() never fills in their .grad; the assertion fails, and the update param -= lr * param.grad / batch_size in sgd would also fail because param.grad is None.
Solution:
- Scale First, Then Enable Gradients: Create the random values, multiply them by 0.01, and only then mark the result as requiring gradients (for example with .requires_grad_(True), or by wrapping it in torch.nn.Parameter). That keeps W1 and W2 as leaf tensors, as the small sketch below illustrates.
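The behavior is easy to reproduce in isolation. Here is a minimal sketch (the tensor names a and w are purely illustrative and not part of your code):

import torch

a = torch.randn(3, requires_grad=True)   # leaf tensor: created directly
w = a * 0.01                             # non-leaf tensor: result of an operation

loss = (w * 2).sum()
loss.backward()

print(a.is_leaf, w.is_leaf)   # True False
print(a.grad)                 # a populated gradient tensor
print(w.grad)                 # None (PyTorch also warns about reading .grad of a non-leaf)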
Here's the corrected code:
import torch
from torchvision import transforms
from torch.utils import data
import torchvision

# ============== load the dataset
def get_dataloader_workers():
    return 4

def load_data_fashion_mnist(batch_size, resize=None):
    trans = [transforms.ToTensor()]
    if resize:
        trans.insert(0, transforms.Resize(resize))
    trans = transforms.Compose(trans)
    mnist_train = torchvision.datasets.FashionMNIST(
        root="../data", train=True, transform=trans, download=True
    )
    mnist_test = torchvision.datasets.FashionMNIST(
        root="../data", train=False, transform=trans, download=True
    )
    return (
        data.DataLoader(
            mnist_train, batch_size, shuffle=True, num_workers=get_dataloader_workers()
        ),
        data.DataLoader(
            mnist_test, batch_size, shuffle=False, num_workers=get_dataloader_workers()
        ),
    )

batch_size = 256
train_iter, test_iter = load_data_fashion_mnist(batch_size)

# ============= model parameters
num_inputs, num_outputs, num_hiddens = 784, 10, 256
# parameter initialization: scale first, then mark the result as requiring
# gradients, so W1 and W2 stay leaf tensors and backward() fills in their .grad
W1 = (torch.randn((num_inputs, num_hiddens)) * 0.01).requires_grad_(True)
b1 = torch.zeros(num_hiddens, requires_grad=True)
W2 = (torch.randn((num_hiddens, num_outputs)) * 0.01).requires_grad_(True)
b2 = torch.zeros(num_outputs, requires_grad=True)

# =========== activation func
def relu(X):
    a = torch.zeros_like(X)
    return torch.max(X, a)

def softmax(X):
    # subtract the row-wise max before exp for numerical stability
    Exp_x = torch.exp(X - X.max(dim=1, keepdim=True)[0])
    partition = Exp_x.sum(axis=1, keepdim=True)
    return Exp_x / partition

# =========== model
def mlp(X):
    """Take a batch of input images."""
    X = X.reshape((-1, num_inputs))  # flatten each image into a length-784 vector
    H = relu(torch.matmul(X, W1) + b1)  # hidden layer
    O = softmax(torch.matmul(H, W2) + b2)
    return O

def cross_entropy_loss(y_hat, y):
    # negative log-probability of the true class for each example
    return -torch.log(y_hat[range(len(y_hat)), y])

def sgd(params, lr, batch_size):
    """minibatch sgd"""
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()

lr = 0.03
num_epochs = 3
net = mlp
loss = cross_entropy_loss

def train():
    for epoch in range(num_epochs):
        for X, y in train_iter:
            # forward pass and loss for the current batch
            y_hat = net(X)
            l = loss(y_hat, y)
            # backward pass and parameter update
            l.sum().backward()
            sgd([W1, b1, W2, b2], lr, batch_size)
        with torch.no_grad():
            # loss on the last batch, as a rough progress indicator for the epoch
            train_l = loss(net(X), y).mean()
            print(f" epoch {epoch+1}, loss {float(train_l.mean()):f}")

if __name__ == "__main__":
    train()
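A quick way to confirm the fix before the full training run is to check that the parameters are now leaf tensors and that one backward pass populates their gradients. A small sanity check, assuming the definitions above have already been executed (for debugging only):

X, y = next(iter(train_iter))
l = loss(net(X), y)
l.sum().backward()
print(W1.is_leaf, W2.is_leaf)            # True True
print(W2.grad is None, b2.grad is None)  # False False
# zero the gradients again so this check does not leak into the real training run
for p in (W1, b1, W2, b2):
    p.grad.zero_()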
Explanation of Changes:
- Leaf-Tensor Initialization: W1 and W2 are now built by creating the random values, scaling them by 0.01, and only then calling .requires_grad_(True) on the result. Each parameter is therefore a leaf tensor, backward() accumulates a gradient into its .grad, and the update param -= lr * param.grad / batch_size in sgd has a real gradient to work with.
- Training Loop Unchanged: The forward pass, loss, l.sum().backward(), and sgd(...) calls already ran once per batch in your original loop, so they stay where they were; only the debugging print statements and assertions were removed.
By making this change, your model will now train correctly, because gradients are accumulated on the parameters themselves, which addresses the root cause of the "W2.grad is none" error.
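If you prefer, the same effect can be achieved with torch.nn.Parameter, which is the idiomatic wrapper for trainable leaf tensors; a minimal sketch of that alternative initialization (only the parameter definitions change, plus importing nn):

import torch
from torch import nn

num_inputs, num_outputs, num_hiddens = 784, 10, 256

# nn.Parameter is always a leaf tensor with requires_grad=True by default
W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens) * 0.01)
b1 = nn.Parameter(torch.zeros(num_hiddens))
W2 = nn.Parameter(torch.randn(num_hiddens, num_outputs) * 0.01)
b2 = nn.Parameter(torch.zeros(num_outputs))

print(W2.is_leaf, W2.requires_grad)  # True True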
Tags: pytorch, mlp From: 78562670