首页 > 编程问答 >为什么模型中的梯度参数全部都是None?

为什么模型中的梯度参数全部都是None?

时间:2024-07-30 17:11:47浏览次数:21  
标签:python machine-learning pytorch conv-neural-network semantic-segmentation

这是我的模型。 我的计划是制作一个融合相机和雷达图像的模型。

class FusionNet(nn.Module):
    def __init__(self, radar_channels=1, camera_channels=3, n_classes=2, bilinear=False):
        super(FusionNet, self).__init__()
        self.radar_channels = radar_channels
        self.camera_channels = camera_channels
        self.n_classes = n_classes
        self.bilinear = bilinear
        factor = 2 if bilinear else 1

        # Encoder for radar images
        self.enc0_radar = DoubleConv(radar_channels, radar_channels)
        self.enc1_radar = Encoder(radar_channels, 8 // factor)
        self.enc2_radar = Encoder(8, 16 // factor)
        self.enc3_radar = Encoder(16, 32 // factor)

        # Encoder for camera images
        self.enc0_camera = DoubleConv(camera_channels, camera_channels)
        self.enc1_camera = Encoder(camera_channels, 8 // factor)
        self.enc2_camera = Encoder(8, 16 // factor)
        self.enc3_camera = Encoder(16, 32 // factor)

        # Fusion
        self.fusion0 = Fusion()

        # Decoder
        self.dec1 = Decoder(32, 16 // factor, bilinear)
        self.fusion1 = Fusion()
        self.dec2 = Decoder(48, 8 // factor, bilinear)
        self.fusion2 = Fusion()
        self.dec3 = Decoder(24, n_classes, bilinear)
        self.outc = OutConv(n_classes, n_classes)

    def forward(self, radar_img, camera_img):
        
        print(f"radar: {radar_img.size()},camera: {camera_img.size()}")
        # Radar image encoding
        x0_radar = self.enc0_radar(radar_img) #
        x1_radar = self.enc1_radar(x0_radar)  # 1->8 channel    (1,210,70)->(8,105,35)
        x2_radar = self.enc2_radar(x1_radar)  # 8->16 channel   (8,105,35)->(16,52,17)
        x3_radar = self.enc3_radar(x2_radar)  # 16->32 channel  (16,52,17)->(32,26,8)

        # Camera image encoding
        x0_camera = self.enc0_camera(camera_img)  #
        x1_camera = self.enc1_camera(x0_camera)  # 3->8 channel   (3,210,70)->(8,105,35)
        x2_camera = self.enc2_camera(x1_camera)  # 8->16 channel  (8,105,35)->(16,52,17)
        x3_camera = self.enc3_camera(x2_camera)  # 16->32 channel (16,52,17)->(32,26,8)

        # Fusion
        # x3_fusion = self.fusion0(x3_radar, x3_camera)
        x3_fusion  = x3_radar + x3_camera

        # Decoding 1
        upsample = UpsampleWithExactSize((52, 17))
        x2_dec = upsample(self.dec1(x3_fusion))           # 
        x2_fusion = self.fusion1(x2_radar, x2_camera, x2_dec)
        
        # Decoding 2
        upsample = UpsampleWithExactSize((105, 35))
        x1_dec = upsample(self.dec2(x2_fusion))
        x1_fusion = self.fusion2(x1_radar, x1_camera, x1_dec)

        # Decoding 3
        upsample = UpsampleWithExactSize((210, 70))
        x0_dec = upsample(self.dec3(x1_fusion))

        # Ofefwut
        ogm = self.outc(x0_dec)  # Final output
        return ogm

class DoubleConv(nn.Module):
    def __init__(self, in_channels, out_channels, mid_channels=None):
        super().__init__()

        if not mid_channels:
            mid_channels = out_channels

        self.double_conv = nn.Sequential(
            nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        return self.double_conv(x)


class Fusion(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x0_rad, x0_cam, x0=None):
        if x0 is not None:
            x = torch.cat([x0_rad, x0_cam, x0], dim=1)
        else:
            x = torch.cat([x0_rad, x0_cam], dim=1)

        return x


""" Downscaling w/ maxpool & double conv. """
class Encoder(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.maxpool_conv = nn.Sequential(nn.MaxPool2d(2), DoubleConv(in_channels, out_channels))
       
    def forward(self, x):
        return self.maxpool_conv(x)


""" Upscaling & double conv. """
class Decoder(nn.Module):
    def __init__(self, in_channels, out_channels, bilinear=True):
        super().__init__()

        # if bilinear, use the normal convolutions to reduce the number of channels
        if bilinear:
            self.up = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True)
            self.conv = DoubleConv(in_channels // 2, out_channels, in_channels // 2)   
        else:
            self.up = nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2) 
            self.conv = DoubleConv(in_channels // 2, out_channels)                               

    def forward(self, x0):
        x = self.up(x0)
        return self.conv(x)


class OutConv(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(OutConv, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1)
       
    def forward(self, x):
        x1 = self.conv(x)
        return x1


class DownsampleConv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=2, stride=2, padding=0):
        super(DownsampleConv, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)

    def forward(self, x):
        return self.conv(x)


class UpsampleWithExactSize(nn.Module):
    def __init__(self, output_size):
        super(UpsampleWithExactSize, self).__init__()
        self.output_size = output_size

    def forward(self, x):
        # Up-sample to the exact size
        x_up = F.interpolate(x, size=self.output_size, mode="bilinear", align_corners=True) 
        return x_up

我已经用 required_grad=true 做到了。但我不知道为什么会出现这个错误。 很奇怪...

Gradient for parameter enc0_radar.double_conv.0.weight is None.
Gradient for parameter enc0_radar.double_conv.1.weight is None.
Gradient for parameter enc0_radar.double_conv.1.bias is None.
Gradient for parameter enc0_radar.double_conv.3.weight is None.
Gradient for parameter enc0_radar.double_conv.4.weight is None.
Gradient for parameter enc0_radar.double_conv.4.bias is None.
Gradient for parameter enc1_radar.maxpool_conv.1.double_conv.0.weight is None.
Gradient for parameter enc1_radar.maxpool_conv.1.double_conv.1.weight is None.
Gradient for parameter enc1_radar.maxpool_conv.1.double_conv.1.bias is None.
Gradient for parameter enc1_radar.maxpool_conv.1.double_conv.3.weight is None.
Gradient for parameter enc1_radar.maxpool_conv.1.double_conv.4.weight is None.
Gradient for parameter enc1_radar.maxpool_conv.1.double_conv.4.bias is None.
Gradient for parameter enc2_radar.maxpool_conv.1.double_conv.0.weight is None.
Gradient for parameter enc2_radar.maxpool_conv.1.double_conv.1.weight is None.
Gradient for parameter enc2_radar.maxpool_conv.1.double_conv.1.bias is None.
Gradient for parameter enc2_radar.maxpool_conv.1.double_conv.3.weight is None.
Gradient for parameter enc2_radar.maxpool_conv.1.double_conv.4.weight is None.
Gradient for parameter enc2_radar.maxpool_conv.1.double_conv.4.bias is None.
Gradient for parameter enc3_radar.maxpool_conv.1.double_conv.0.weight is None.
Gradient for parameter enc3_radar.maxpool_conv.1.double_conv.1.weight is None.
Gradient for parameter enc3_radar.maxpool_conv.1.double_conv.1.bias is None.
Gradient for parameter enc3_radar.maxpool_conv.1.double_conv.3.weight is None.
Gradient for parameter enc3_radar.maxpool_conv.1.double_conv.4.weight is None.
Gradient for parameter enc3_radar.maxpool_conv.1.double_conv.4.bias is None.
Gradient for parameter enc0_camera.double_conv.0.weight is None.
Gradient for parameter enc0_camera.double_conv.1.weight is None.
Gradient for parameter enc0_camera.double_conv.1.bias is None.
Gradient for parameter enc0_camera.double_conv.3.weight is None.
Gradient for parameter enc0_camera.double_conv.4.weight is None.
Gradient for parameter enc0_camera.double_conv.4.bias is None.
Gradient for parameter enc1_camera.maxpool_conv.1.double_conv.0.weight is None.
Gradient for parameter enc1_camera.maxpool_conv.1.double_conv.1.weight is None.
Gradient for parameter enc1_camera.maxpool_conv.1.double_conv.1.bias is None.
Gradient for parameter enc1_camera.maxpool_conv.1.double_conv.3.weight is None.
Gradient for parameter enc1_camera.maxpool_conv.1.double_conv.4.weight is None.
Gradient for parameter enc1_camera.maxpool_conv.1.double_conv.4.bias is None.
Gradient for parameter enc2_camera.maxpool_conv.1.double_conv.0.weight is None.
Gradient for parameter enc2_camera.maxpool_conv.1.double_conv.1.weight is None.
Gradient for parameter enc2_camera.maxpool_conv.1.double_conv.1.bias is None.
Gradient for parameter enc2_camera.maxpool_conv.1.double_conv.3.weight is None.
Gradient for parameter enc2_camera.maxpool_conv.1.double_conv.4.weight is None.
Gradient for parameter enc2_camera.maxpool_conv.1.double_conv.4.bias is None.
Gradient for parameter enc3_camera.maxpool_conv.1.double_conv.0.weight is None.
Gradient for parameter enc3_camera.maxpool_conv.1.double_conv.1.weight is None.
Gradient for parameter enc3_camera.maxpool_conv.1.double_conv.1.bias is None.
Gradient for parameter enc3_camera.maxpool_conv.1.double_conv.3.weight is None.
Gradient for parameter enc3_camera.maxpool_conv.1.double_conv.4.weight is None.
Gradient for parameter enc3_camera.maxpool_conv.1.double_conv.4.bias is None.
Gradient for parameter dec1.up.weight is None.
Gradient for parameter dec1.up.bias is None.
Gradient for parameter dec1.conv.double_conv.0.weight is None.
Gradient for parameter dec1.conv.double_conv.1.weight is None.
Gradient for parameter dec1.conv.double_conv.1.bias is None.
Gradient for parameter dec1.conv.double_conv.3.weight is None.
Gradient for parameter dec1.conv.double_conv.4.weight is None.
Gradient for parameter dec1.conv.double_conv.4.bias is None.
Gradient for parameter dec2.up.weight is None.
Gradient for parameter dec2.up.bias is None.
Gradient for parameter dec2.conv.double_conv.0.weight is None.
Gradient for parameter dec2.conv.double_conv.1.weight is None.
Gradient for parameter dec2.conv.double_conv.1.bias is None.
Gradient for parameter dec2.conv.double_conv.3.weight is None.
Gradient for parameter dec2.conv.double_conv.4.weight is None.
Gradient for parameter dec2.conv.double_conv.4.bias is None.
Gradient for parameter dec3.up.weight is None.
Gradient for parameter dec3.up.bias is None.
Gradient for parameter dec3.conv.double_conv.0.weight is None.
Gradient for parameter dec3.conv.double_conv.1.weight is None.
Gradient for parameter dec3.conv.double_conv.1.bias is None.
Gradient for parameter dec3.conv.double_conv.3.weight is None.
Gradient for parameter dec3.conv.double_conv.4.weight is None.
Gradient for parameter dec3.conv.double_conv.4.bias is None.
Gradient for parameter outc.conv.weight is None.
Gradient for parameter outc.conv.bias is None.

是的, train.py 有效,但分数很奇怪。 而且训练结果也很奇怪

我该怎么办?请帮我。 参数的梯度是 None 是什么意思?

这是 train.py 代码。

import argparse
import logging
import os
import random
import sys
from pathlib import Path

import numpy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import torchvision.transforms.functional as TF
from torch import optim
from torch.utils.data import DataLoader, random_split,TensorDataset
from tqdm import tqdm
from PIL import Image
import wandb
from evaluate import evaluate
from radOGM import FusionNet, radOGM
from utils.data_loading import BasicDataset, CarvanaDataset
from utils.dice_score import dice_loss

dir_radar = Path("./data/radar/")
dir_camera = Path("./data/camera/")
dir_mask = Path("./data/masks/")
dir_checkpoint = Path("./checkpoints/")


def train_model(
    model,
    device,
    epochs: int = 5,
    batch_size: int = 1,
    learning_rate: float = 1e-5,
    val_percent: float = 0.1,
    save_checkpoint: bool = True,
    img_scale: float = 1.0,
    amp: bool = False,
    weight_decay: float = 1e-8,
    momentum: float = 0.999,
    gradient_clipping: float = 1.0,
):

    # 수정사항: dataset 3개: radar, camera, mask===================================
    # 1. Create dataset
    try:
        dataset = CarvanaDataset(dir_radar, dir_camera, dir_mask, img_scale)
    except (AssertionError, RuntimeError) as e:
        print(f"Error creating FusionDataset: {e}")
        return
    # ============================================================================

    # 2. Split into train / validation partitions
    n_val = int(len(dataset) * val_percent)
    n_train = len(dataset) - n_val
    train_set, val_set = random_split(dataset, [n_train, n_val], generator=torch.Generator().manual_seed(0))

    # 3. Create data loaders
    loader_args = dict(batch_size=batch_size, num_workers=os.cpu_count(), pin_memory=True)
    train_loader = DataLoader(train_set, shuffle=True, **loader_args)
    val_loader = DataLoader(val_set, shuffle=False, drop_last=True, **loader_args)

    # (Initialize logging)
    experiment = wandb.init(project="U-Net", resume="allow", anonymous="must")
    experiment.config.update(
        dict(
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            val_percent=val_percent,
            save_checkpoint=save_checkpoint,
            img_scale=img_scale,
            amp=amp,
        )
    )

    logging.info(
        f"""Starting training:
        Epochs:          {epochs}
        Batch size:      {batch_size}
        Learning rate:   {learning_rate}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_checkpoint}
        Device:          {device.type}
        Images scaling:  {img_scale}
        Mixed Precision: {amp}
    """
    )

    # 4. Set up the optimizer, the loss, the learning rate scheduler and the loss scaling for AMP
    optimizer = optim.RMSprop(
        model.parameters(), lr=learning_rate, weight_decay=weight_decay, momentum=momentum, foreach=True
    )
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, "max", patience=5)  # goal: maximize Dice score
    grad_scaler = torch.cuda.amp.GradScaler(enabled=amp)
    criterion = nn.CrossEntropyLoss() if model.n_classes > 1 else nn.BCEWithLogitsLoss()
    global_step = 0

    # 5. Begin training
    for epoch in range(1, epochs + 1):
        model.train()
        epoch_loss = 0
        with tqdm(total=n_train, desc=f"Epoch {epoch}/{epochs}", unit="img") as pbar:
            for batch in train_loader:
                # =================================================================================
                radar_images, camera_images = batch["radar"], batch["camera"]
                true_masks = batch["mask"]
                # =================================================================================

                assert radar_images.shape[1] == model_instance.radar_channels, (
                    f"Network has been defined with {model_instance.radar_channels} input channels, "
                    f"but loaded images have {radar_images.shape[1]} channels. Please check that "
                    "the images are loaded correctly."
                )

               

                # =================================================================================
                radar_images = radar_images.to(device=device, dtype=torch.float32, memory_format=torch.channels_last)
                camera_images = camera_images.to(device=device, dtype=torch.float32, memory_format=torch.channels_last)
                true_masks = true_masks.to(device=device, dtype=torch.long)

                # radar_images = torch.tensor(radar_images,requires_grad=True)
                # camera_images = torch.tensor(camera_images,requires_grad=True)
                # true_masks = torch.tensor(true_masks,requires_grad=True)
                # =================================================================================

                print(f"Radar image shape: {radar_images.size()}")
                print(f"Camera image shape: {camera_images.size()}")
                print(f"Mask image shape: {true_masks.size()}")

                # torch.autograd.set_detect_anomaly(True)
                with torch.autocast(device.type if device.type != "mps" else "cpu", enabled=amp):
                    # =================================================================================
                    masks_pred = model(radar_images, camera_images)
                 
                    print(f"Model output shape: {masks_pred.size()}")  
                    check_gradients(model)
                    # =================================================================================
                    if model.n_classes == 1:
                        loss = criterion(masks_pred.squeeze(1), true_masks.float())
                        loss += dice_loss(F.sigmoid(masks_pred.squeeze(1)), true_masks.float(), multiclass=False)
                    else:
                        loss = criterion(masks_pred, true_masks)
                        loss += dice_loss(
                            F.softmax(masks_pred, dim=1).float(),
                            F.one_hot(true_masks, model.n_classes).permute(0, 3, 1, 2).float(),
                            multiclass=True,
                        )

                optimizer.zero_grad(set_to_none=True)
                grad_scaler.scale(loss).backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clipping)
                grad_scaler.step(optimizer)
                grad_scaler.update()

                pbar.update(radar_images.shape[0])
                global_step += 1
                epoch_loss += loss.item()
                experiment.log({"train loss": loss.item(), "step": global_step, "epoch": epoch})
                pbar.set_postfix(**{"loss (batch)": loss.item()})

                # Evaluation round
                division_step = n_train // (5 * batch_size)
                if division_step > 0:
                    if global_step % division_step == 0:
                        histograms = {}
                        for tag, value in model.named_parameters():
                            tag = tag.replace("/", ".")

                            if not torch.isinf(value).any():
                                histograms["Weights/" + tag] = wandb.Histogram(value.data.cpu())
                            if not torch.isinf(value.grad).any():
                                histograms["Gradients/" + tag] = wandb.Histogram(value.grad.data.cpu())
                       
                        val_score = evaluate(model, val_loader, device, amp)
                        scheduler.step(val_score)

                        logging.info("Validation Dice score: {}".format(val_score))
                        try:
                            experiment.log(
                                {
                                    "learning rate": optimizer.param_groups[0]["lr"],
                                    "validation Dice": val_score,
                                    "images": wandb.Image(radar_images[0].cpu()),
                                    "masks": {
                                        "true": wandb.Image(true_masks[0].float().cpu()),
                                        "pred": wandb.Image(masks_pred.argmax(dim=1)[0].float().cpu()),
                                    },
                                    "step": global_step,
                                    "epoch": epoch,
                                    **histograms,
                                }
                            )
                        except:
                            pass

        if save_checkpoint:
            Path(dir_checkpoint).mkdir(parents=True, exist_ok=True)
            state_dict = model.state_dict()
            state_dict["mask_values"] = dataset.mask_values
            torch.save(state_dict, str(dir_checkpoint / "checkpoint_epoch{}.pth".format(epoch)))
            logging.info(f"Checkpoint {epoch} saved!")


def get_args():
    parser = argparse.ArgumentParser(description="Train the UNet on images and target masks")
    parser.add_argument("--epochs", "-e", metavar="E", type=int, default=5, help="Number of epochs")
    parser.add_argument("--batch-size", "-b", dest="batch_size", metavar="B", type=int, default=4, help="Batch size")
    parser.add_argument(
        "--learning-rate", "-l", metavar="LR", type=float, default=1e-4, help="Learning rate", dest="lr"
    )
    parser.add_argument("--load", "-f", type=str, default=False, help="Load model from a .pth file")
    parser.add_argument("--scale", "-s", type=float, default=1.0, help="Downscaling factor of the images")
    parser.add_argument(
        "--validation",
        "-v",
        dest="val",
        type=float,
        default=10.0,
        help="Percent of the data that is used as validation (0-100)",
    )
    parser.add_argument("--amp", action="store_true", default=False, help="Use mixed precision")
    parser.add_argument("--bilinear", action="store_true", default=False, help="Use bilinear upsampling")
    parser.add_argument("--classes", "-c", type=int, default=2, help="Number of classes")

    return parser.parse_args()


def check_gradients(model):
    for name, param in model.named_parameters():
        # if param.grad is not None:
        #     print(f"Grad for {name}: {param.grad}")
        if param.grad is None:
            print(f"Gradient for parameter {name} is None.")
        elif torch.isinf(param.grad).any():
            print(f"Gradient for parameter {name} contains inf values.")
        elif torch.isnan(param.grad).any():
            print(f"Gradient for parameter {name} contains NaN values.")


if __name__ == "__main__":
    args = get_args()

    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logging.info(f"Using device {device}")

    # Change here to adapt to your data
    # n_channels=3 for RGB images
    # n_classes is the number of probabilities you want to get per pixel
    # model = UNet(n_channels=3, n_classes=args.classes, bilinear=args.bilinear)

    # ==================================================================================================
    model = FusionNet(radar_channels=1, camera_channels=3, n_classes=args.classes, bilinear=args.bilinear)
    # ==================================================================================================
    if torch.cuda.device_count() > 1:  # for multi-GPU
        model = torch.nn.DataParallel(model)

    # model = model.to(memory_format=torch.channels_last)
    # Access model attributes correctly depending on whether DataParallel is used
    model_instance = model.module if isinstance(model, torch.nn.DataParallel) else model
    logging.info(
        f"Network:\n"
        f"\t{model_instance.radar_channels + model_instance.camera_channels} input channels\n"
        f"\t{model_instance.n_classes} output channels (classes)\n"
        f'\t{"Bilinear" if model_instance.bilinear else "Transposed conv"} upscaling'
    )

    if args.load:
        state_dict = torch.load(args.load, map_location=device)
        del state_dict["mask_values"]
        model.load_state_dict(state_dict)
        logging.info(f"Model loaded from {args.load}")

    model.to(device=device)
    try:
        train_model(
            model=model,
            epochs=args.epochs,
            batch_size=args.batch_size,
            learning_rate=args.lr,
            device=device,
            img_scale=args.scale,
            val_percent=args.val / 100,
            amp=args.amp,
        )
    except torch.cuda.OutOfMemoryError:
        logging.error(
            "Detected OutOfMemoryError! "
            "Enabling checkpointing to reduce memory usage, but this slows down training. "
            "Consider enabling AMP (--amp) for fast and memory efficient training"
        )
        torch.cuda.empty_cache()
        model.use_checkpointing()
        train_model(
            model=model,
            epochs=args.epochs,
            batch_size=args.batch_size,
            learning_rate=args.lr,
            device=device,
            img_scale=args.scale,
            val_percent=args.val / 100,
            amp=args.amp,
        )


模型中参数的梯度为 None 表示在反向传播期间,这些参数没有收到任何梯度更新。这通常表示的模型中存在一些问题,导致计算图中的某些部分未连接到损失函数,或者在计算梯度时出现错误。

以下是的代码中可能导致此问题的一些原因:

  1. 融合机制的实现: 正在使用简单的加法来融合来自雷达和相机编码器的特征图 ( x3_fusion = x3_radar + x3_camera )。虽然这在技术上是有效的,但它可能不会创建反向传播可以有效地通过的梯度流。尝试使用更复杂或可学习的融合机制,例如:

  2. 连接: 连接两个特征图,然后使用卷积层来学习融合表示。

  3. 注意力机制: 使用注意力机制来学习根据输入动态地权衡来自两个编码器的特征。

  4. 上采样操作: 正在使用 UpsampleWithExactSize 模块,该模块基于 F.interpolate 。虽然这对于上采样很有用,但如果的目标是进行语义分割等密集预测任务,则它可能不是理想的选择。 F.interpolate 在执行上采样时不会考虑学习到的特征,这可能会影响梯度流。考虑使用转置卷积层进行上采样,因为它们允许模型学习如何以更有效的方式对特征进行上采样。

  5. 损失函数: 请确保损失函数的定义正确,并且正在正确计算。检查是否存在可能导致梯度计算错误的任何潜在错误,例如零除或对数中的零。

  6. 数据预处理和加载: 请仔细检查的数据加载和预处理步骤。确保将数据正确归一化并且输入模型的形状正确。数据加载中的任何问题都可能导致梯度计算出现意外行为。

  7. 网络初始化: 网络权重的初始化方式可能会影响梯度流。尝试不同的初始化策略,例如 He 初始化或 Xavier 初始化,以查看它们是否改善了梯度流。

调试步骤:

  1. 简化的模型: 从一个更简单的模型开始,只有几个层,并逐渐添加更多层,同时在每一步检查梯度。这将有助于隔离导致问题的特定层或模块。
  2. 可视化梯度: 使用 Tensorboard 或类似的工具来可视化训练期间不同层的梯度。这可以帮助识别梯度消失或爆炸的区域。
  3. 逐步调试: 在的代码中使用 print 语句或调试器来检查不同点的张量形状和梯度值。这可以帮助跟踪数据流并识别出现问题的位置。

通过仔细检查的代码并执行这些调试步骤,应该能够识别导致梯度为 None 的原因,并解决模型中的问题。

标签:python,machine-learning,pytorch,conv-neural-network,semantic-segmentation
From: 78810687

相关文章

  • Python - Creating alternative initializers using class Methods
    Classmethodsallowustodefinealternativeinitializers(alsoknownasfactorymethods)inaclass.Thesemethodshelpuscreateinstanceobjectsfromdifferenttypesofinputdata.Letusunderstandthiswiththehelpofanexample.Again,wetakethe......
  • 如何让 Python 请求信任自签名 SSL 证书?
    importrequestsdata={'foo':'bar'}url='https://foo.com/bar'r=requests.post(url,data=data)如果URL使用自签名证书,则会失败requests.exceptions.SSLError:[Errno1]_ssl.c:507:error:14090086:SSLroutines:SSL3_GET_SERVER_CERTIF......
  • python 偏函数
    如下代码loop=tornado.ioloop.IOLoop.current()ctx=contextvars.copy_context()func_call=functools.partial(ctx.run,func,*args,**kwargs)returnawaitloop.run_in_executor(executor,func_call)偏函数一个函数作为模板,通过提供部分参数来产生一个新的函数。......
  • Chapter 18 Python异常
    欢迎大家订阅【Python从入门到精通】专栏,一起探索Python的无限可能!文章目录前言一、什么是异常二、捕获异常三、异常的传递前言在Python中,异常是一种特定的对象,能够在程序运行过程中被抛出和处理。有效地管理异常不仅可以增强程序的稳定性,还可以提高用户体验,使程......
  • Python正则表达式匹配数字的第一次重复
    示例:For0123123123,1应匹配,因为第二个1出现在任何其他数字重复之前。For01234554321,5应该匹配,因为第二个5出现在任何其他数字的重复之前。我尝试过的一些正则表达式:......
  • 当 python 极坐标中某些列条目为空时,如何分解 List[_] 列?
    给定如下所示的Polarsdf,如何在两列上调用explode(),同时将空条目扩展到正确的长度以与其行匹配?shape:(3,2)┌───────────┬─────────────────────┐│x┆y││---┆---......
  • 从CNN到Transformer:基于PyTorch的遥感影像、无人机影像的地物分类、目标检测、语义分
    原文链接:从CNN到Transformer:基于PyTorch的遥感影像、无人机影像的地物分类、目标检测、语义分割和点云分类教程https://mp.weixin.qq.com/s?__biz=MzUzNTczMDMxMg==&mid=2247610610&idx=5&sn=f973c3e430c89d6123ca8f4892086c55&chksm=fa827115cdf5f8036ef8111c6f06cf592a8c0587......
  • 使用python从网站自动下载pdf时出错
    我想从一个名为epadossier.nl的网站自动批量下载pdf我用谷歌搜索了这个并找到了一段代码并修复了一个小错误。所以现在我得到了这个`importrequestsurl="https://www.epadossier.nl/adres/plaats/straat/num"response=requests.get(url)ifresponse.status_cod......
  • 避免字符串连接的嵌套循环的 Pythonic 方法
    我想找到所有5位数字的字符串,其中前三位数字在我的第一个列表中,第二个槽第四个数字在我的第二个列表中,第三到第五个数字在我的最后一个列表中:l0=["123","567","451"]l1=["234","239","881"]l2=["348","551","399"......
  • Python 环境配置(二)安装jupyter、matplotlib、numpy库
    Python环境配置(二)安装jupyter、matplotlib、numpy库一、numpypipinstallnumpy二、matplotlibpipinstallmatplotlib三、jupyter1、anaconda自带Jupyter2、pycharm插件只有Pycharm的Professional版才支持JupyterNotebook,请注意版本3、新建文件#%......