视觉变换器模型未按应有的方式进行训练

标签：python pytorch transformer-model torchvision

这是我使用 Pytorch 构建的视觉转换器的代码。该模型的交叉熵为 2.31，准确度约为 10%。这在所有时代都是一样的。因此，该模型无法训练。请让我知道我做错了什么，如果可能的话，请发送修改后的代码。预先感谢！

PS：该模型是在 MNIST 上训练的

# -*- coding: utf-8 -*-
"""
Created on Sun Jul  2 14:04:19 2023

@author: Paras
"""

import torch
from torch import nn
from torchvision import transforms
import torchvision.datasets as datasets
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import math

class Head(nn.Module):
    def __init__(self,num_heads,weight_dimension):
        super(Head, self).__init__()
        self.w1 = nn.Parameter(torch.randn((weight_dimension*num_heads,weight_dimension))).to(device)
        self.w2 = nn.Parameter(torch.randn((weight_dimension*num_heads,weight_dimension))).to(device)
        self.w3 = nn.Parameter(torch.randn((weight_dimension*num_heads,weight_dimension))).to(device)
        
        
    def forward(self,x):
        
        x = x.to(device)
        self.Q = torch.matmul(x,self.w1).to(device)
        self.K = torch.matmul(x,self.w2).to(device)
        self.V = torch.matmul(x,self.w3).to(device)
        
        lnq = nn.LayerNorm(self.Q.size()[1:]).to(device)
        lnk = nn.LayerNorm(self.K.size()[1:]).to(device)
        lnv = nn.LayerNorm(self.V.size()[1:]).to(device)

        self.Q = lnq(self.Q)
        self.K = lnk(self.K)
        self.V = lnv(self.V)
        self.K = torch.transpose(self.K, -2, -1)

        out = torch.matmul(self.Q,self.K)
        out = out/np.sqrt(self.Q.shape[1])
        out = F.softmax(out,dim=-1)
        out = torch.matmul(out,self.V)
        return out
        
    
class MHA(nn.Module):
    def __init__(self,num_heads,weight_dimension):
        super(MHA, self).__init__()
        self.num_heads = num_heads
        self.weight_dimension = weight_dimension
        heads = []
        for i in range(self.num_heads):
            head = Head(self.num_heads,self.weight_dimension)
            heads.append(head)
            
        self.heads = heads
        
    def forward(self,x):
        
        flag=True
        for i in range(self.num_heads):
            if flag:
                out_multihead = self.heads[i](x)
                flag=False
            else:
                out_multihead = torch.cat((out_multihead,self.heads[i](x)),axis=2)
        
        return out_multihead
            
    
class vit_model(nn.Module):

    def __init__(self,img_size,patch_size,embedding_dim,n_heads,hidden_dims_mlp,n_classes,batch_size):
        
        super().__init__()
        self.patch_size = patch_size
        self.n_heads = n_heads
        self.hidden_dims_mlp = hidden_dims_mlp
        self.img_size = img_size
        self.n_classes = n_classes
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        
        embedding_rows = self.patch_size*self.patch_size
        embedding_cols = self.embedding_dim
        embedding_cols = int(embedding_cols)
        
        self.embedding_matrix = nn.Parameter(torch.randn((embedding_rows,embedding_cols)))
        self.embedding_matrix.requires_grad_(True)
        
        self.added_class_head = nn.Parameter(torch.randn((1,embedding_cols))) #Normally distributed like nn.init.normal_ (std 10-6)
        self.added_class_head.requires_grad_(True)
        
        self.positional_embeddings = nn.Parameter(self.get_positional_encodings((img_size//patch_size)**2,embedding_cols)) #Trunc Normal distribution
        self.positional_embeddings.requires_grad_(True)
        
        self.weight_dimension = embedding_cols//self.n_heads
        
        self.mha = MHA(self.n_heads,self.weight_dimension)
        
        self.mlp_inside_encoder = nn.Sequential(
            nn.Linear(self.embedding_dim*(self.positional_embeddings.shape[0]+1), self.hidden_dims_mlp),
            nn.GELU(),
            nn.Dropout(0.5),
            nn.Linear(self.hidden_dims_mlp, self.embedding_dim*(self.positional_embeddings.shape[0]+1)),
            nn.GELU(),
            nn.Dropout(0.5)
            )
        
        self.mlp_classification = nn.Sequential(
            nn.Linear(self.embedding_dim, self.n_classes),
            nn.GELU(),
            nn.Dropout(0.5),
            nn.Linear(self.n_classes, self.n_classes),
            nn.GELU(),
            nn.Dropout(0.5)
            )
        
    def divide_image_into_patches(self,imgs,patch_size):
        
        imgs = imgs/255
        
        startx, starty = 0,0
        batch_size, channels, height, width = imgs.shape
        
        flag = True
        for startx in range(0,height,patch_size):
            for starty in range(0,width,patch_size):
                tmat = imgs[:,:,startx:startx+patch_size,starty:starty+patch_size]
                
                tmat = tmat.reshape((batch_size,1,tmat.shape[1]*tmat.shape[2]*tmat.shape[3]))
                if flag:
                    patches_list = tmat
                    flag=False
                else:
                    patches_list = torch.cat((patches_list,tmat),1)

        return patches_list
    

    def get_positional_encodings(self,seq_length, hidden_size):
        position = torch.arange(seq_length).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, hidden_size, 2) * (-math.log(10000.0) / hidden_size))
        encodings = torch.zeros(seq_length, hidden_size)
        encodings[:, 0::2] = torch.sin(position * div_term)
        encodings[:, 1::2] = torch.cos(position * div_term)
        return encodings
    
    
    def forward(self,images):
        
    
        out = self.divide_image_into_patches(images,self.patch_size)
        out = torch.matmul(out,self.embedding_matrix)
        out = out + self.positional_embeddings.unsqueeze(0).expand(self.batch_size, -1, -1)
        out = torch.cat((out,self.added_class_head.expand(self.batch_size, 1, -1)),1)
        out = out.to(self.device)
        ln = nn.LayerNorm(out.size()[1:]).to(self.device)
        out = ln(out)
        layer_norm1 = out.clone()
        out = self.mha(out)
        out = out + layer_norm1
        skip = out.clone()
        out = out.to(self.device)
        ln = nn.LayerNorm(out.size()[1:]).to(self.device)
        out = ln(out)
        out = self.mlp_inside_encoder(out.reshape(out.shape[0],out.shape[1]*out.shape[2]))
        out = skip + out.reshape(self.batch_size,layer_norm1.shape[1],self.embedding_dim)
        out = out[:,-1,:]
        out = self.mlp_classification(out)
        return out

# Define the transform to normalize the data
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])

# Load the training and test datasets
train_dataset = datasets.MNIST(root='./data', train=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, transform=transform)

# Create data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=1000)
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=1000)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

torch.autograd.set_detect_anomaly(True)
model = vit_model(28, 4, 512, 8, 2048, 10, 1000)
model = model.to(device)
print(model)
#num_params = sum(p.numel() for p in model.named_parameters())
for p in model.named_parameters():
    print(p)
#print('Number of parameters:',num_params)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)


epoch_losses = []
epoch_accuracies = []
for epoch in range(100):  # Number of training epochs

    epoch_loss = []
    epoch_acc = []
    model.train()
    for i, (images,labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        c = model(images)
        loss = criterion(c,labels)

        with torch.no_grad():
            predictions = torch.argmax(c, dim=-1)

        acc = torch.sum(predictions == labels)/1000
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss.append(loss.item())
        epoch_acc.append(acc.cpu().numpy())

    model.eval()
    epoch_losses.append(np.average(epoch_loss))
    epoch_accuracies.append(np.average(epoch_acc))
    print('Epoch loss:',epoch_losses[-1])
    print('Epoch accuracy:',epoch_accuracies[-1])

我尝试了不同的学习率、补丁大小等超参数，但它不起作用。

代码中有几个问题可能会阻止你的视觉变换器模型无法正确训练。让我们一一解决它们：

不正确的层归一化： 你在多头注意力机制中错误地应用了层归一化。你应该在计算注意力分数之前对键、查询和值应用层归一化，而不是在计算注意力分数之后应用。
不正确的跳跃连接： 你没有以最优的方式在编码器块中实现跳跃连接。你应该将 MLP 块的输出添加到多头注意力块的输出中，而不是添加到层归一化的输出中。
MLP 头的维度不正确： 分类 MLP 头的输入维度不正确。它应该是 self.embedding_dim ，而不是 self.embedding_dim * (self.positional_embeddings.shape[0] + 1) 。
学习率选择不当： SGD 优化器的学习率 0.01 对于 ViT 模型来说可能太高了。尝试使用更小的学习率，例如 0.001 或 0.0001 。
缺少位置嵌入的批处理维度： 在将位置嵌入添加到补丁嵌入时，你忘记添加批处理维度。

以下是修改后的代码：

import torch
from torch import nn
from torchvision import transforms
import torchvision.datasets as datasets
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import math

class Head(nn.Module):
    def __init__(self, num_heads, weight_dimension):
        super(Head, self).__init__()
        self.w1 = nn.Parameter(torch.randn((weight_dimension * num_heads, weight_dimension))).to(device)
        self.w2 = nn.Parameter(torch.randn((weight_dimension * num_heads, weight_dimension))).to(device)
        self.w3 = nn.Parameter(torch.randn((weight_dimension * num_heads, weight_dimension))).to(device)

    def forward(self, x):
        x = x.to(device)
        self.Q = torch.matmul(x, self.w1).to(device)
        self.K = torch.matmul(x, self.w2).to(device)
        self.V = torch.matmul(x, self.w3).to(device)

        # 在这里应用层归一化
        lnq = nn.LayerNorm(self.Q.size()[1:]).to(device)
        lnk = nn.LayerNorm(self.K.size()[1:]).to(device)
        lnv = nn.LayerNorm(self.V.size()[1:]).to(device)

        self.Q = lnq(self.Q)
        self.K = lnk(self.K)
        self.V = lnv(self.V)

        self.K = torch.transpose(self.K, -2, -1)

        out = torch.matmul(self.Q, self.K)
        out = out / np.sqrt(self.Q.shape[1])
        out = F.softmax(out, dim=-1)
        out = torch.matmul(out, self.V)
        return out


class MHA(nn.Module):
    def __init__(self, num_heads, weight_dimension):
        super(MHA, self).__init__()
        self.num_heads = num_heads
        self.weight_dimension = weight_dimension
        heads = []
        for i in range(self.num_heads):
            head = Head(self.num_heads, self.weight_dimension)
            heads.append(head)

        self.heads = heads

    def forward(self, x):
        flag = True
        for i in range(self.num_heads):
            if flag:
                out_multihead = self.heads[i](x)
                flag = False
            else:
                out_multihead = torch.cat((out_multihead, self.heads[i](x)), axis=2)

        return out_multihead


class vit_model(nn.Module):

    def __init__(self, img_size, patch_size, embedding_dim, n_heads, hidden_dims_mlp, n_classes, batch_size):
        super().__init__()
        self.patch_size = patch_size
        self.n_heads = n_heads
        self.hidden_dims_mlp = hidden_dims_mlp
        self.img_size = img_size
        self.n_classes = n_classes
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size

        embedding_rows = self.patch_size * self.patch_size
        embedding_cols = self.embedding_dim
        embedding_cols = int(embedding_cols)

        self.embedding_matrix = nn.Parameter(torch.randn((embedding_rows, embedding_cols)))
        self.embedding_matrix.requires_grad_(True)

        self.added_class_head = nn.Parameter(torch.randn((1, embedding_cols)))
        self.added_class_head.requires_grad_(True)

        self.positional_embeddings = nn.Parameter(self.get_positional_encodings((img_size // patch_size) ** 2, embedding_cols))
        self.positional_embeddings.requires_grad_(True)

        self.weight_dimension = embedding_cols // self.n_heads

        self.mha = MHA(self.n_heads, self.weight_dimension)

        # 正确的跳跃连接
        self.mlp_inside_encoder = nn.Sequential(
            nn.Linear(self.embedding_dim, self.hidden_dims_mlp),
            nn.GELU(),
            nn.Dropout(0.5),
            nn.Linear(self.hidden_dims_mlp, self.embedding_dim),
            nn.GELU(),
            nn.Dropout(0.5)
        )

        # 正确的输入维度
        self.mlp_classification = nn.Sequential(
            nn.Linear(self.embedding_dim, self.n_classes),
            nn.GELU(),
            nn.Dropout(0.5),
            nn.Linear(self.n_classes, self.n_classes),
            nn.GELU(),
            nn.Dropout(0.5)
        )

    def divide_image_into_patches(self, imgs, patch_size):
        imgs = imgs / 255

        startx, starty = 0, 0
        batch_size, channels, height, width = imgs.shape

        flag = True
        for startx in range(0, height, patch_size):
            for starty in range(0, width, patch_size):
                tmat = imgs[:, :, startx:startx + patch_size, starty:starty + patch_size]

                tmat = tmat.reshape((batch_size, 1, tmat.shape[1] * tmat.shape[2] * tmat.shape[3]))
                if flag:
                    patches_list = tmat
                    flag = False
                else:
                    patches_list = torch.cat((patches_list, tmat), 1)

        return patches_list

    def get_positional_encodings(self, seq_length, hidden_size):
        position = torch.arange(seq_length).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, hidden_size, 2) * (-math.log(10000.0) / hidden_size))
        encodings = torch.zeros(seq_length, hidden_size)
        encodings[:, 0::2] = torch.sin(position * div_term)
        encodings[:, 1::2] = torch.cos(position * div_term)
        return encodings

    def forward(self, images):
        out = self.divide_image_into_patches(images, self.patch_size)
        out = torch.matmul(out, self.embedding_matrix)
        # 添加批处理维度
        out = out + self.positional_embeddings.unsqueeze(0).expand(out.size(0), -1, -1)
        out = torch.cat((out, self.added_class_head.expand(self.batch_size, 1, -1)), 1)
        out = out.to(self.device)
        ln = nn.LayerNorm(out.size()[1:]).to(self.device)

        # 正确的跳跃连接
        residual = out.clone()
        out = ln(out)
        out = self.mha(out)
        out = out + residual

        out = out.to(self.device)
        ln = nn.LayerNorm(out.size()[1:]).to(self.device)

        # 正确的跳跃连接
        residual = out.clone()
        out = ln(out)
        out = self.mlp_inside_encoder(out)
        out = out + residual

        out = out[:, -1, :]
        out = self.mlp_classification(out)
        return out

# 定义变换以标准化数据
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])

# 加载训练和测试数据集
train_dataset = datasets.MNIST(root='./data', train=True, transform=transform, download=True)
test_dataset = datasets.MNIST(root='./data', train=False, transform=transform, download=True)

# 创建数据加载器
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=1000)
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=1000)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

torch.autograd.set_detect_anomaly(True)
model = vit_model(28, 4, 512, 8, 2048, 10, 1000)
model = model.to(device)
print(model)
#num_params = sum(p.numel() for p in model.named_parameters())
# for p in model.named_parameters():
#     print(p)
#print('Number of parameters:',num_params)
criterion = nn.CrossEntropyLoss()
# 较小的学习率
optimizer = optim.Adam(model.parameters(), lr=0.001)


epoch_losses = []
epoch_accuracies = []
for epoch in range(100):  # 训练轮次

    epoch_loss = []
    epoch_acc = []
    model.train()
    for i, (images,labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        c = model(images)
        loss = criterion(c,labels)

        with torch.no_grad():
            predictions = torch.argmax(c, dim=-1)

        acc = torch.sum(predictions == labels)/1000
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss.append(loss.item())
        epoch_acc.append(acc.cpu().numpy())

    model.eval()
    epoch_losses.append(np.average(epoch_loss))
    epoch_accuracies.append(np.average(epoch_acc))
    print('Epoch loss:',epoch_losses[-1])
    print('Epoch accuracy:',epoch_accuracies[-1])

这些修改应该解决原始代码中的问题并允许你的视觉变换器模型正确训练。

标签：python,pytorch,transformer-model,torchvision
From： 76600611

视觉变换器模型未按应有的方式进行训练

相关文章

赞助商

阅读排行