The following implements several optimizers commonly used in deep learning: SGD, Momentum, Nesterov, AdaGrad, RMSProp, AdaDelta, Adam, and AdamW.
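For reference, the Adam and AdamW update rules implemented below can be written as follows (learning rate η, weight decay λ, step count t, and ε = 1e-7 as in the code); AdamW only adds the decoupled weight-decay term λθ to the step, while the Adam class applies no weight decay at all:

m_t = \beta_1 m_{t-1} + (1-\beta_1) g_t, \qquad v_t = \beta_2 v_{t-1} + (1-\beta_2) g_t^2
\hat{m}_t = \frac{m_t}{1-\beta_1^t}, \qquad \hat{v}_t = \frac{v_t}{1-\beta_2^t}
\text{Adam:}\quad \theta_t = \theta_{t-1} - \eta\,\frac{\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon}
\text{AdamW:}\quad \theta_t = \theta_{t-1} - \eta\left(\frac{\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon} + \lambda\,\theta_{t-1}\right)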
The code is as follows:
import torch
import torch.nn as nn
from torchvision import transforms, datasets

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class SGD:
    """Vanilla stochastic gradient descent: p <- p - lr * grad."""
    def __init__(self, model, lr=0.001):
        self.lr = lr
        self.model = model

    def zero_grad(self):
        self.model.zero_grad()

    def step(self):
        with torch.no_grad():
            for p in self.model.parameters():
                if p.requires_grad:
                    p -= self.lr * p.grad


class Momentum:
    """Heavy-ball momentum: v <- momentum * v - lr * grad, p <- p + v."""
    def __init__(self, model, lr=0.001, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.model = model
        self.v = {}
        for id, p in enumerate(model.parameters()):
            if p.requires_grad:
                self.v[id] = torch.zeros_like(p).to(device)

    def zero_grad(self):
        self.model.zero_grad()

    def step(self):
        with torch.no_grad():
            for id, p in enumerate(self.model.parameters()):
                if p.requires_grad:
                    self.v[id] = self.momentum * self.v[id] - self.lr * p.grad
                    p += self.v[id]


class Nesterov:
    """Nesterov momentum, rewritten so the gradient is evaluated at the current parameters (http://arxiv.org/abs/1212.0901)."""
    def __init__(self, model, lr=0.001, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.model = model
        self.v = {}
        for id, p in enumerate(model.parameters()):
            if p.requires_grad:
                self.v[id] = torch.zeros_like(p).to(device)

    def zero_grad(self):
        self.model.zero_grad()

    def step(self):
        with torch.no_grad():
            for id, p in enumerate(self.model.parameters()):
                if p.requires_grad:
                    # p is updated with the old velocity; v is then refreshed with the current gradient
                    p += self.momentum * self.momentum * self.v[id] - (1 + self.momentum) * self.lr * p.grad
                    self.v[id] = self.momentum * self.v[id] - self.lr * p.grad


class AdaGrad:
    """Per-parameter learning rates scaled by the accumulated sum of squared gradients."""
    def __init__(self, model, lr=0.001):
        self.lr = lr
        self.model = model
        self.v = {}
        for id, p in enumerate(model.parameters()):
            if p.requires_grad:
                self.v[id] = torch.zeros_like(p).to(device)

    def zero_grad(self):
        self.model.zero_grad()

    def step(self):
        with torch.no_grad():
            for id, p in enumerate(self.model.parameters()):
                if p.requires_grad:
                    self.v[id] += p.grad * p.grad
                    p -= self.lr * p.grad / (torch.sqrt(self.v[id]) + 1e-7)


class RMSprop:
    """AdaGrad with an exponential moving average of squared gradients instead of a full sum."""
    def __init__(self, model, lr=0.001, decay_rate=0.99):
        self.lr = lr
        self.decay_rate = decay_rate
        self.model = model
        self.v = {}
        for id, p in enumerate(model.parameters()):
            if p.requires_grad:
                self.v[id] = torch.zeros_like(p).to(device)

    def zero_grad(self):
        self.model.zero_grad()

    def step(self):
        with torch.no_grad():
            for id, p in enumerate(self.model.parameters()):
                if p.requires_grad:
                    self.v[id] = self.decay_rate * self.v[id] + (1 - self.decay_rate) * p.grad * p.grad
                    p -= self.lr * p.grad / (torch.sqrt(self.v[id]) + 1e-7)


class AdaDelta:
    """RMSprop-style scaling where the step is also rescaled by a running average of past squared updates."""
    def __init__(self, model, lr=0.001, decay_rate=0.99):
        self.lr = lr
        self.decay_rate = decay_rate
        self.model = model
        self.u = {}  # running average of squared updates (initialized to ones here)
        self.v = {}  # running average of squared gradients
        for id, p in enumerate(model.parameters()):
            if p.requires_grad:
                self.u[id] = torch.ones_like(p).to(device)
                self.v[id] = torch.zeros_like(p).to(device)

    def zero_grad(self):
        self.model.zero_grad()

    def step(self):
        with torch.no_grad():
            for id, p in enumerate(self.model.parameters()):
                if p.requires_grad:
                    self.v[id] = self.decay_rate * self.v[id] + (1 - self.decay_rate) * p.grad * p.grad
                    delta_w = (torch.sqrt(self.u[id] + 1e-7) / torch.sqrt(self.v[id] + 1e-7)) * p.grad
                    self.u[id] = self.decay_rate * self.u[id] + (1 - self.decay_rate) * delta_w * delta_w
                    p -= self.lr * delta_w  # note: scaled by lr here, unlike the original AdaDelta paper
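
# Note on the bias correction in the Adam/AdamW classes below: m and v start at zero,
# so for small t the moving averages underestimate the true first/second moments.
# Dividing by (1 - beta**t) corrects this; e.g. after the first step m_1 = (1 - beta1) * g_1,
# and m_hat = m_1 / (1 - beta1**1) = g_1.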

class Adam:
    """Adam (http://arxiv.org/abs/1412.6980v8): momentum on the gradient plus RMSprop-style scaling, with bias correction."""
    def __init__(self, model, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0
        self.model = model
        self.m = {}  # first-moment estimate
        self.v = {}  # second-moment estimate
        for id, p in enumerate(model.parameters()):
            if p.requires_grad:
                self.m[id] = torch.zeros_like(p).to(device)
                self.v[id] = torch.zeros_like(p).to(device)

    def zero_grad(self):
        self.model.zero_grad()

    def step(self):
        self.iter += 1
        with torch.no_grad():
            for id, p in enumerate(self.model.parameters()):
                if p.requires_grad:
                    self.m[id] = self.beta1 * self.m[id] + (1 - self.beta1) * p.grad
                    self.v[id] = self.beta2 * self.v[id] + (1 - self.beta2) * (p.grad ** 2)
                    m_hat = self.m[id] / (1 - self.beta1 ** self.iter)
                    v_hat = self.v[id] / (1 - self.beta2 ** self.iter)
                    p -= self.lr * m_hat / (torch.sqrt(v_hat) + 1e-7)


class AdamW:
    """Adam with decoupled weight decay: the weight_decay * p term is added to the step rather than to the gradient."""
    def __init__(self, model, lr=0.001, beta1=0.9, beta2=0.999, weight_decay=0.01):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.weight_decay = weight_decay
        self.iter = 0
        self.model = model
        self.m = {}
        self.v = {}
        for id, p in enumerate(model.parameters()):
            if p.requires_grad:
                self.m[id] = torch.zeros_like(p).to(device)
                self.v[id] = torch.zeros_like(p).to(device)

    def zero_grad(self):
        self.model.zero_grad()

    def step(self):
        self.iter += 1
        with torch.no_grad():
            for id, p in enumerate(self.model.parameters()):
                if p.requires_grad:
                    self.m[id] = self.beta1 * self.m[id] + (1 - self.beta1) * p.grad
                    self.v[id] = self.beta2 * self.v[id] + (1 - self.beta2) * (p.grad ** 2)
                    m_hat = self.m[id] / (1 - self.beta1 ** self.iter)
                    v_hat = self.v[id] / (1 - self.beta2 ** self.iter)
                    p -= self.lr * (m_hat / (torch.sqrt(v_hat) + 1e-7) + self.weight_decay * p)


class LeNet(nn.Module):
    """Small LeNet-style CNN for 28x28 MNIST images."""
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5)
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.max_pool2d(x, 2)
        x = torch.relu(self.conv2(x))
        x = torch.max_pool2d(x, 2)
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x


trainset = datasets.MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())
train_loader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)

model = LeNet()
criterion = nn.CrossEntropyLoss()

# pick one of the optimizers defined above
# opt = SGD(model, 0.001)
# opt = Momentum(model, 0.001, 0.9)
# opt = Nesterov(model, 0.001, 0.9)
# opt = AdaGrad(model, 0.001)
# opt = RMSprop(model, 0.001, 0.99)
# opt = AdaDelta(model, 0.001, 0.99)
# opt = Adam(model, 0.001, 0.9, 0.999)
opt = AdamW(model, 0.001, 0.9, 0.999, 0.01)

model.to(device)

num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)
        output = model(images)
        loss = criterion(output, labels)
        opt.zero_grad()
        loss.backward()
        opt.step()
        running_loss += loss.item()
        _, predicted = torch.max(output.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Accuracy: {(100 * correct / total):.2f}%")
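As an optional sanity check, each hand-written optimizer can be compared against its torch.optim counterpart. The minimal sketch below covers SGD only; it assumes the SGD class and device defined above are in scope, and the variable names (ref, ours, opt_ref, opt_ours) are just illustrative:

import copy

torch.manual_seed(0)
ref = nn.Linear(4, 2).to(device)   # reference copy, updated by torch.optim.SGD
ours = copy.deepcopy(ref)          # identical copy, updated by the hand-written SGD

opt_ref = torch.optim.SGD(ref.parameters(), lr=0.1)
opt_ours = SGD(ours, lr=0.1)

x = torch.randn(8, 4).to(device)
y = torch.randn(8, 2).to(device)
loss_fn = nn.MSELoss()

# one identical step with each optimizer
for net, opt_ in ((ref, opt_ref), (ours, opt_ours)):
    opt_.zero_grad()
    loss_fn(net(x), y).backward()
    opt_.step()

for p_ref, p_ours in zip(ref.parameters(), ours.parameters()):
    assert torch.allclose(p_ref, p_ours, atol=1e-6)
print("hand-written SGD matches torch.optim.SGD for one step")

Only SGD is checked here because the other classes make small implementation choices (the 1e-7 epsilon, its placement, AdaDelta's lr scaling and ones initialization) that keep them from matching torch.optim exactly.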
References:
手写深度学习之优化器(SGD、Momentum、Nesterov、AdaGrad、RMSProp、Adam) (hand-written deep-learning optimizers), CSDN blog
Adam和AdamW的区别 (the difference between Adam and AdamW), CSDN blog