这是我使用 Pytorch 构建的视觉转换器的代码。该模型的交叉熵为 2.31,准确度约为 10%。这在所有时代都是一样的。因此,该模型无法训练。请让我知道我做错了什么,如果可能的话,请发送修改后的代码。预先感谢!
PS:该模型是在 MNIST 上训练的
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 2 14:04:19 2023
@author: Paras
"""
import torch
from torch import nn
from torchvision import transforms
import torchvision.datasets as datasets
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import math
class Head(nn.Module):
def __init__(self,num_heads,weight_dimension):
super(Head, self).__init__()
self.w1 = nn.Parameter(torch.randn((weight_dimension*num_heads,weight_dimension))).to(device)
self.w2 = nn.Parameter(torch.randn((weight_dimension*num_heads,weight_dimension))).to(device)
self.w3 = nn.Parameter(torch.randn((weight_dimension*num_heads,weight_dimension))).to(device)
def forward(self,x):
x = x.to(device)
self.Q = torch.matmul(x,self.w1).to(device)
self.K = torch.matmul(x,self.w2).to(device)
self.V = torch.matmul(x,self.w3).to(device)
lnq = nn.LayerNorm(self.Q.size()[1:]).to(device)
lnk = nn.LayerNorm(self.K.size()[1:]).to(device)
lnv = nn.LayerNorm(self.V.size()[1:]).to(device)
self.Q = lnq(self.Q)
self.K = lnk(self.K)
self.V = lnv(self.V)
self.K = torch.transpose(self.K, -2, -1)
out = torch.matmul(self.Q,self.K)
out = out/np.sqrt(self.Q.shape[1])
out = F.softmax(out,dim=-1)
out = torch.matmul(out,self.V)
return out
class MHA(nn.Module):
def __init__(self,num_heads,weight_dimension):
super(MHA, self).__init__()
self.num_heads = num_heads
self.weight_dimension = weight_dimension
heads = []
for i in range(self.num_heads):
head = Head(self.num_heads,self.weight_dimension)
heads.append(head)
self.heads = heads
def forward(self,x):
flag=True
for i in range(self.num_heads):
if flag:
out_multihead = self.heads[i](x)
flag=False
else:
out_multihead = torch.cat((out_multihead,self.heads[i](x)),axis=2)
return out_multihead
class vit_model(nn.Module):
def __init__(self,img_size,patch_size,embedding_dim,n_heads,hidden_dims_mlp,n_classes,batch_size):
super().__init__()
self.patch_size = patch_size
self.n_heads = n_heads
self.hidden_dims_mlp = hidden_dims_mlp
self.img_size = img_size
self.n_classes = n_classes
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.embedding_dim = embedding_dim
self.batch_size = batch_size
embedding_rows = self.patch_size*self.patch_size
embedding_cols = self.embedding_dim
embedding_cols = int(embedding_cols)
self.embedding_matrix = nn.Parameter(torch.randn((embedding_rows,embedding_cols)))
self.embedding_matrix.requires_grad_(True)
self.added_class_head = nn.Parameter(torch.randn((1,embedding_cols))) #Normally distributed like nn.init.normal_ (std 10-6)
self.added_class_head.requires_grad_(True)
self.positional_embeddings = nn.Parameter(self.get_positional_encodings((img_size//patch_size)**2,embedding_cols)) #Trunc Normal distribution
self.positional_embeddings.requires_grad_(True)
self.weight_dimension = embedding_cols//self.n_heads
self.mha = MHA(self.n_heads,self.weight_dimension)
self.mlp_inside_encoder = nn.Sequential(
nn.Linear(self.embedding_dim*(self.positional_embeddings.shape[0]+1), self.hidden_dims_mlp),
nn.GELU(),
nn.Dropout(0.5),
nn.Linear(self.hidden_dims_mlp, self.embedding_dim*(self.positional_embeddings.shape[0]+1)),
nn.GELU(),
nn.Dropout(0.5)
)
self.mlp_classification = nn.Sequential(
nn.Linear(self.embedding_dim, self.n_classes),
nn.GELU(),
nn.Dropout(0.5),
nn.Linear(self.n_classes, self.n_classes),
nn.GELU(),
nn.Dropout(0.5)
)
def divide_image_into_patches(self,imgs,patch_size):
imgs = imgs/255
startx, starty = 0,0
batch_size, channels, height, width = imgs.shape
flag = True
for startx in range(0,height,patch_size):
for starty in range(0,width,patch_size):
tmat = imgs[:,:,startx:startx+patch_size,starty:starty+patch_size]
tmat = tmat.reshape((batch_size,1,tmat.shape[1]*tmat.shape[2]*tmat.shape[3]))
if flag:
patches_list = tmat
flag=False
else:
patches_list = torch.cat((patches_list,tmat),1)
return patches_list
def get_positional_encodings(self,seq_length, hidden_size):
position = torch.arange(seq_length).unsqueeze(1)
div_term = torch.exp(torch.arange(0, hidden_size, 2) * (-math.log(10000.0) / hidden_size))
encodings = torch.zeros(seq_length, hidden_size)
encodings[:, 0::2] = torch.sin(position * div_term)
encodings[:, 1::2] = torch.cos(position * div_term)
return encodings
def forward(self,images):
out = self.divide_image_into_patches(images,self.patch_size)
out = torch.matmul(out,self.embedding_matrix)
out = out + self.positional_embeddings.unsqueeze(0).expand(self.batch_size, -1, -1)
out = torch.cat((out,self.added_class_head.expand(self.batch_size, 1, -1)),1)
out = out.to(self.device)
ln = nn.LayerNorm(out.size()[1:]).to(self.device)
out = ln(out)
layer_norm1 = out.clone()
out = self.mha(out)
out = out + layer_norm1
skip = out.clone()
out = out.to(self.device)
ln = nn.LayerNorm(out.size()[1:]).to(self.device)
out = ln(out)
out = self.mlp_inside_encoder(out.reshape(out.shape[0],out.shape[1]*out.shape[2]))
out = skip + out.reshape(self.batch_size,layer_norm1.shape[1],self.embedding_dim)
out = out[:,-1,:]
out = self.mlp_classification(out)
return out
# Define the transform to normalize the data
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
# Load the training and test datasets
train_dataset = datasets.MNIST(root='./data', train=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, transform=transform)
# Create data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=1000)
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=1000)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.autograd.set_detect_anomaly(True)
model = vit_model(28, 4, 512, 8, 2048, 10, 1000)
model = model.to(device)
print(model)
#num_params = sum(p.numel() for p in model.named_parameters())
for p in model.named_parameters():
print(p)
#print('Number of parameters:',num_params)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
epoch_losses = []
epoch_accuracies = []
for epoch in range(100): # Number of training epochs
epoch_loss = []
epoch_acc = []
model.train()
for i, (images,labels) in enumerate(train_loader):
images = images.to(device)
labels = labels.to(device)
c = model(images)
loss = criterion(c,labels)
with torch.no_grad():
predictions = torch.argmax(c, dim=-1)
acc = torch.sum(predictions == labels)/1000
optimizer.zero_grad()
loss.backward()
optimizer.step()
epoch_loss.append(loss.item())
epoch_acc.append(acc.cpu().numpy())
model.eval()
epoch_losses.append(np.average(epoch_loss))
epoch_accuracies.append(np.average(epoch_acc))
print('Epoch loss:',epoch_losses[-1])
print('Epoch accuracy:',epoch_accuracies[-1])
我尝试了不同的学习率、补丁大小等超参数,但它不起作用。
代码中有几个问题可能会阻止你的视觉变换器模型无法正确训练。让我们一一解决它们:
-
不正确的层归一化: 你在多头注意力机制中错误地应用了层归一化。 你应该在计算注意力分数 之前 对键、查询和值应用层归一化,而不是在计算注意力分数 之后 应用。
-
不正确的跳跃连接: 你没有以最优的方式在编码器块中实现跳跃连接。 你应该将 MLP 块的输出添加到多头注意力块的输出中,而不是添加到层归一化的输出中。
-
MLP 头的维度不正确: 分类 MLP 头的输入维度不正确。 它应该是
self.embedding_dim
,而不是self.embedding_dim * (self.positional_embeddings.shape[0] + 1)
。 -
学习率选择不当: SGD 优化器的学习率
0.01
对于 ViT 模型来说可能太高了。 尝试使用更小的学习率,例如0.001
或0.0001
。 -
缺少位置嵌入的批处理维度: 在将位置嵌入添加到补丁嵌入时,你忘记添加批处理维度。
以下是修改后的代码:
import torch
from torch import nn
from torchvision import transforms
import torchvision.datasets as datasets
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import math
class Head(nn.Module):
def __init__(self, num_heads, weight_dimension):
super(Head, self).__init__()
self.w1 = nn.Parameter(torch.randn((weight_dimension * num_heads, weight_dimension))).to(device)
self.w2 = nn.Parameter(torch.randn((weight_dimension * num_heads, weight_dimension))).to(device)
self.w3 = nn.Parameter(torch.randn((weight_dimension * num_heads, weight_dimension))).to(device)
def forward(self, x):
x = x.to(device)
self.Q = torch.matmul(x, self.w1).to(device)
self.K = torch.matmul(x, self.w2).to(device)
self.V = torch.matmul(x, self.w3).to(device)
# 在这里应用层归一化
lnq = nn.LayerNorm(self.Q.size()[1:]).to(device)
lnk = nn.LayerNorm(self.K.size()[1:]).to(device)
lnv = nn.LayerNorm(self.V.size()[1:]).to(device)
self.Q = lnq(self.Q)
self.K = lnk(self.K)
self.V = lnv(self.V)
self.K = torch.transpose(self.K, -2, -1)
out = torch.matmul(self.Q, self.K)
out = out / np.sqrt(self.Q.shape[1])
out = F.softmax(out, dim=-1)
out = torch.matmul(out, self.V)
return out
class MHA(nn.Module):
def __init__(self, num_heads, weight_dimension):
super(MHA, self).__init__()
self.num_heads = num_heads
self.weight_dimension = weight_dimension
heads = []
for i in range(self.num_heads):
head = Head(self.num_heads, self.weight_dimension)
heads.append(head)
self.heads = heads
def forward(self, x):
flag = True
for i in range(self.num_heads):
if flag:
out_multihead = self.heads[i](x)
flag = False
else:
out_multihead = torch.cat((out_multihead, self.heads[i](x)), axis=2)
return out_multihead
class vit_model(nn.Module):
def __init__(self, img_size, patch_size, embedding_dim, n_heads, hidden_dims_mlp, n_classes, batch_size):
super().__init__()
self.patch_size = patch_size
self.n_heads = n_heads
self.hidden_dims_mlp = hidden_dims_mlp
self.img_size = img_size
self.n_classes = n_classes
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.embedding_dim = embedding_dim
self.batch_size = batch_size
embedding_rows = self.patch_size * self.patch_size
embedding_cols = self.embedding_dim
embedding_cols = int(embedding_cols)
self.embedding_matrix = nn.Parameter(torch.randn((embedding_rows, embedding_cols)))
self.embedding_matrix.requires_grad_(True)
self.added_class_head = nn.Parameter(torch.randn((1, embedding_cols)))
self.added_class_head.requires_grad_(True)
self.positional_embeddings = nn.Parameter(self.get_positional_encodings((img_size // patch_size) ** 2, embedding_cols))
self.positional_embeddings.requires_grad_(True)
self.weight_dimension = embedding_cols // self.n_heads
self.mha = MHA(self.n_heads, self.weight_dimension)
# 正确的跳跃连接
self.mlp_inside_encoder = nn.Sequential(
nn.Linear(self.embedding_dim, self.hidden_dims_mlp),
nn.GELU(),
nn.Dropout(0.5),
nn.Linear(self.hidden_dims_mlp, self.embedding_dim),
nn.GELU(),
nn.Dropout(0.5)
)
# 正确的输入维度
self.mlp_classification = nn.Sequential(
nn.Linear(self.embedding_dim, self.n_classes),
nn.GELU(),
nn.Dropout(0.5),
nn.Linear(self.n_classes, self.n_classes),
nn.GELU(),
nn.Dropout(0.5)
)
def divide_image_into_patches(self, imgs, patch_size):
imgs = imgs / 255
startx, starty = 0, 0
batch_size, channels, height, width = imgs.shape
flag = True
for startx in range(0, height, patch_size):
for starty in range(0, width, patch_size):
tmat = imgs[:, :, startx:startx + patch_size, starty:starty + patch_size]
tmat = tmat.reshape((batch_size, 1, tmat.shape[1] * tmat.shape[2] * tmat.shape[3]))
if flag:
patches_list = tmat
flag = False
else:
patches_list = torch.cat((patches_list, tmat), 1)
return patches_list
def get_positional_encodings(self, seq_length, hidden_size):
position = torch.arange(seq_length).unsqueeze(1)
div_term = torch.exp(torch.arange(0, hidden_size, 2) * (-math.log(10000.0) / hidden_size))
encodings = torch.zeros(seq_length, hidden_size)
encodings[:, 0::2] = torch.sin(position * div_term)
encodings[:, 1::2] = torch.cos(position * div_term)
return encodings
def forward(self, images):
out = self.divide_image_into_patches(images, self.patch_size)
out = torch.matmul(out, self.embedding_matrix)
# 添加批处理维度
out = out + self.positional_embeddings.unsqueeze(0).expand(out.size(0), -1, -1)
out = torch.cat((out, self.added_class_head.expand(self.batch_size, 1, -1)), 1)
out = out.to(self.device)
ln = nn.LayerNorm(out.size()[1:]).to(self.device)
# 正确的跳跃连接
residual = out.clone()
out = ln(out)
out = self.mha(out)
out = out + residual
out = out.to(self.device)
ln = nn.LayerNorm(out.size()[1:]).to(self.device)
# 正确的跳跃连接
residual = out.clone()
out = ln(out)
out = self.mlp_inside_encoder(out)
out = out + residual
out = out[:, -1, :]
out = self.mlp_classification(out)
return out
# 定义变换以标准化数据
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
# 加载训练和测试数据集
train_dataset = datasets.MNIST(root='./data', train=True, transform=transform, download=True)
test_dataset = datasets.MNIST(root='./data', train=False, transform=transform, download=True)
# 创建数据加载器
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=1000)
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=1000)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.autograd.set_detect_anomaly(True)
model = vit_model(28, 4, 512, 8, 2048, 10, 1000)
model = model.to(device)
print(model)
#num_params = sum(p.numel() for p in model.named_parameters())
# for p in model.named_parameters():
# print(p)
#print('Number of parameters:',num_params)
criterion = nn.CrossEntropyLoss()
# 较小的学习率
optimizer = optim.Adam(model.parameters(), lr=0.001)
epoch_losses = []
epoch_accuracies = []
for epoch in range(100): # 训练轮次
epoch_loss = []
epoch_acc = []
model.train()
for i, (images,labels) in enumerate(train_loader):
images = images.to(device)
labels = labels.to(device)
c = model(images)
loss = criterion(c,labels)
with torch.no_grad():
predictions = torch.argmax(c, dim=-1)
acc = torch.sum(predictions == labels)/1000
optimizer.zero_grad()
loss.backward()
optimizer.step()
epoch_loss.append(loss.item())
epoch_acc.append(acc.cpu().numpy())
model.eval()
epoch_losses.append(np.average(epoch_loss))
epoch_accuracies.append(np.average(epoch_acc))
print('Epoch loss:',epoch_losses[-1])
print('Epoch accuracy:',epoch_accuracies[-1])
这些修改应该解决原始代码中的问题并允许你的视觉变换器模型正确训练。
标签:python,pytorch,transformer-model,torchvision From: 76600611