Spent a day getting the mmdetection3d and DETR3D code environment working on a new PC
Picking up from the previous post.
Working from the test command bash tools/dist_test.sh projects/DETR3D/configs/detr3d_r101_gridmask.py ~/Downloads/detr3d_r101_gridmask.pth 1,
the entry point should be tools/test.py, which dist_test.sh launches.
Then set a breakpoint and step in to read the code:
After entering the main function, the first thing that happens is the initialization of a long, long config:
(Pdb) cfg
Config (path: projects/DETR3D/configs/detr3d_r101_gridmask.py): {'default_scope': 'mmdet3d', 'default_hooks': {'timer': {'type': 'IterTimerHook'}, 'logger': {'type': 'LoggerHook', 'interval': 50}, 'param_scheduler': {'type': 'ParamSchedulerHook'}, 'checkpoint': {'type': 'CheckpointHook', 'interval': 1, 'max_keep_ckpts': 1, 'save_last': True}, 'sampler_seed': {'type': 'DistSamplerSeedHook'}, 'visualization': {'type': 'Det3DVisualizationHook'}}, 'env_cfg': {'cudnn_benchmark': False, 'mp_cfg': {'mp_start_method': 'fork', 'opencv_num_threads': 0}, 'dist_cfg': {'backend': 'nccl'}}, 'log_processor': {'type': 'LogProcessor', 'window_size': 50, 'by_epoch': True}, 'log_level': 'INFO', 'load_from': 'ckpts/fcos3d.pth', 'resume': False, 'custom_imports': {'imports': ['projects.DETR3D.detr3d']}, 'point_cloud_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], 'voxel_size': [0.2, 0.2, 8], 'img_norm_cfg': {'mean': [103.53, 116.28, 123.675], 'std': [1.0, 1.0, 1.0], 'bgr_to_rgb': False}, 'class_names': ['car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'], 'input_modality': {'use_lidar': False, 'use_camera': True, 'use_radar': False, 'use_map': False, 'use_external': False}, 'model': {'type': 'DETR3D', 'use_grid_mask': True, 'data_preprocessor': {'type': 'Det3DDataPreprocessor', 'mean': [103.53, 116.28, 123.675], 'std': [1.0, 1.0, 1.0], 'bgr_to_rgb': False, 'pad_size_divisor': 32}, 'img_backbone': {'type': 'mmdet.ResNet', 'depth': 101, 'num_stages': 4, 'out_indices': (0, 1, 2, 3), 'frozen_stages': 1, 'norm_cfg': {'type': 'BN2d', 'requires_grad': False}, 'norm_eval': True, 'style': 'caffe', 'dcn': {'type': 'DCNv2', 'deform_groups': 1, 'fallback_on_stride': False}, 'stage_with_dcn': (False, False, True, True)}, 'img_neck': {'type': 'mmdet.FPN', 'in_channels': [256, 512, 1024, 2048], 'out_channels': 256, 'start_level': 1, 'add_extra_convs': 'on_output', 'num_outs': 4, 'relu_before_extra_convs': True}, 'pts_bbox_head': {'type': 'DETR3DHead', 'num_query': 900, 'num_classes': 10, 'in_channels': 256, 'sync_cls_avg_factor': True, 'with_box_refine': True, 'as_two_stage': False, 'transformer': {'type': 'Detr3DTransformer', 'decoder': {'type': 'Detr3DTransformerDecoder', 'num_layers': 6, 'return_intermediate': True, 'transformerlayers': {'type': 'mmdet.DetrTransformerDecoderLayer', 'attn_cfgs': [{'type': 'MultiheadAttention', 'embed_dims': 256, 'num_heads': 8, 'dropout': 0.1}, {'type': 'Detr3DCrossAtten', 'pc_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], 'num_points': 1, 'embed_dims': 256}], 'feedforward_channels': 512, 'ffn_dropout': 0.1, 'operation_order': ('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')}}}, 'bbox_coder': {'type': 'NMSFreeCoder', 'post_center_range': [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], 'pc_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], 'max_num': 300, 'voxel_size': [0.2, 0.2, 8], 'num_classes': 10}, 'positional_encoding': {'type': 'mmdet.SinePositionalEncoding', 'num_feats': 128, 'normalize': True, 'offset': -0.5}, 'loss_cls': {'type': 'mmdet.FocalLoss', 'use_sigmoid': True, 'gamma': 2.0, 'alpha': 0.25, 'loss_weight': 2.0}, 'loss_bbox': {'type': 'mmdet.L1Loss', 'loss_weight': 0.25}, 'loss_iou': {'type': 'mmdet.GIoULoss', 'loss_weight': 0.0}}, 'train_cfg': {'pts': {'grid_size': [512, 512, 1], 'voxel_size': [0.2, 0.2, 8], 'point_cloud_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], 'out_size_factor': 4, 'assigner': {'type': 'HungarianAssigner3D', 'cls_cost': {'type': 'mmdet.FocalLossCost', 'weight': 2.0}, 'reg_cost': 
{'type': 'BBox3DL1Cost', 'weight': 0.25}, 'iou_cost': {'type': 'mmdet.IoUCost', 'weight': 0.0}, 'pc_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]}}}}, 'dataset_type': 'NuScenesDataset', 'data_root': 'data/nuscenes/', 'test_transforms': [{'type': 'RandomResize3D', 'scale': (1600, 900), 'ratio_range': (1.0, 1.0), 'keep_ratio': True}], 'train_transforms': [{'type': 'PhotoMetricDistortion3D'}, {'type': 'RandomResize3D', 'scale': (1600, 900), 'ratio_range': (1.0, 1.0), 'keep_ratio': True}], 'backend_args': None, 'train_pipeline': [{'type': 'LoadMultiViewImageFromFiles', 'to_float32': True, 'num_views': 6, 'backend_args': None}, {'type': 'LoadAnnotations3D', 'with_bbox_3d': True, 'with_label_3d': True, 'with_attr_label': False}, {'type': 'MultiViewWrapper', 'transforms': [{'type': 'PhotoMetricDistortion3D'}, {'type': 'RandomResize3D', 'scale': (1600, 900), 'ratio_range': (1.0, 1.0), 'keep_ratio': True}]}, {'type': 'ObjectRangeFilter', 'point_cloud_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]}, {'type': 'ObjectNameFilter', 'classes': ['car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone']}, {'type': 'Pack3DDetInputs', 'keys': ['img', 'gt_bboxes_3d', 'gt_labels_3d']}], 'test_pipeline': [{'type': 'LoadMultiViewImageFromFiles', 'to_float32': True, 'num_views': 6, 'backend_args': None}, {'type': 'MultiViewWrapper', 'transforms': [{'type': 'RandomResize3D', 'scale': (1600, 900), 'ratio_range': (1.0, 1.0), 'keep_ratio': True}]}, {'type': 'Pack3DDetInputs', 'keys': ['img']}], 'metainfo': {'classes': ['car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone']}, 'data_prefix': {'pts': '', 'CAM_FRONT': 'samples/CAM_FRONT', 'CAM_FRONT_LEFT': 'samples/CAM_FRONT_LEFT', 'CAM_FRONT_RIGHT': 'samples/CAM_FRONT_RIGHT', 'CAM_BACK': 'samples/CAM_BACK', 'CAM_BACK_RIGHT': 'samples/CAM_BACK_RIGHT', 'CAM_BACK_LEFT': 'samples/CAM_BACK_LEFT'}, 'train_dataloader': {'batch_size': 1, 'num_workers': 4, 'persistent_workers': True, 'drop_last': False, 'sampler': {'type': 'DefaultSampler', 'shuffle': True}, 'dataset': {'type': 'NuScenesDataset', 'data_root': 'data/nuscenes/', 'ann_file': 'nuscenes_infos_train.pkl', 'pipeline': [{'type': 'LoadMultiViewImageFromFiles', 'to_float32': True, 'num_views': 6, 'backend_args': None}, {'type': 'LoadAnnotations3D', 'with_bbox_3d': True, 'with_label_3d': True, 'with_attr_label': False}, {'type': 'MultiViewWrapper', 'transforms': [{'type': 'PhotoMetricDistortion3D'}, {'type': 'RandomResize3D', 'scale': (1600, 900), 'ratio_range': (1.0, 1.0), 'keep_ratio': True}]}, {'type': 'ObjectRangeFilter', 'point_cloud_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]}, {'type': 'ObjectNameFilter', 'classes': ['car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone']}, {'type': 'Pack3DDetInputs', 'keys': ['img', 'gt_bboxes_3d', 'gt_labels_3d']}], 'load_type': 'frame_based', 'metainfo': {'classes': ['car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone']}, 'modality': {'use_lidar': False, 'use_camera': True, 'use_radar': False, 'use_map': False, 'use_external': False}, 'test_mode': False, 'data_prefix': {'pts': '', 'CAM_FRONT': 'samples/CAM_FRONT', 'CAM_FRONT_LEFT': 'samples/CAM_FRONT_LEFT', 'CAM_FRONT_RIGHT': 'samples/CAM_FRONT_RIGHT', 'CAM_BACK': 'samples/CAM_BACK', 'CAM_BACK_RIGHT': 'samples/CAM_BACK_RIGHT', 'CAM_BACK_LEFT': 
'samples/CAM_BACK_LEFT'}, 'box_type_3d': 'LiDAR', 'backend_args': None}}, 'val_dataloader': {'batch_size': 1, 'num_workers': 4, 'persistent_workers': True, 'drop_last': False, 'sampler': {'type': 'DefaultSampler', 'shuffle': False}, 'dataset': {'type': 'NuScenesDataset', 'data_root': 'data/nuscenes/', 'ann_file': 'nuscenes_infos_val.pkl', 'load_type': 'frame_based', 'pipeline': [{'type': 'LoadMultiViewImageFromFiles', 'to_float32': True, 'num_views': 6, 'backend_args': None}, {'type': 'MultiViewWrapper', 'transforms': [{'type': 'RandomResize3D', 'scale': (1600, 900), 'ratio_range': (1.0, 1.0), 'keep_ratio': True}]}, {'type': 'Pack3DDetInputs', 'keys': ['img']}], 'metainfo': {'classes': ['car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone']}, 'modality': {'use_lidar': False, 'use_camera': True, 'use_radar': False, 'use_map': False, 'use_external': False}, 'test_mode': True, 'data_prefix': {'pts': '', 'CAM_FRONT': 'samples/CAM_FRONT', 'CAM_FRONT_LEFT': 'samples/CAM_FRONT_LEFT', 'CAM_FRONT_RIGHT': 'samples/CAM_FRONT_RIGHT', 'CAM_BACK': 'samples/CAM_BACK', 'CAM_BACK_RIGHT': 'samples/CAM_BACK_RIGHT', 'CAM_BACK_LEFT': 'samples/CAM_BACK_LEFT'}, 'box_type_3d': 'LiDAR', 'backend_args': None}}, 'test_dataloader': {'batch_size': 1, 'num_workers': 4, 'persistent_workers': True, 'drop_last': False, 'sampler': {'type': 'DefaultSampler', 'shuffle': False}, 'dataset': {'type': 'NuScenesDataset', 'data_root': 'data/nuscenes/', 'ann_file': 'nuscenes_infos_val.pkl', 'load_type': 'frame_based', 'pipeline': [{'type': 'LoadMultiViewImageFromFiles', 'to_float32': True, 'num_views': 6, 'backend_args': None}, {'type': 'MultiViewWrapper', 'transforms': [{'type': 'RandomResize3D', 'scale': (1600, 900), 'ratio_range': (1.0, 1.0), 'keep_ratio': True}]}, {'type': 'Pack3DDetInputs', 'keys': ['img']}], 'metainfo': {'classes': ['car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone']}, 'modality': {'use_lidar': False, 'use_camera': True, 'use_radar': False, 'use_map': False, 'use_external': False}, 'test_mode': True, 'data_prefix': {'pts': '', 'CAM_FRONT': 'samples/CAM_FRONT', 'CAM_FRONT_LEFT': 'samples/CAM_FRONT_LEFT', 'CAM_FRONT_RIGHT': 'samples/CAM_FRONT_RIGHT', 'CAM_BACK': 'samples/CAM_BACK', 'CAM_BACK_RIGHT': 'samples/CAM_BACK_RIGHT', 'CAM_BACK_LEFT': 'samples/CAM_BACK_LEFT'}, 'box_type_3d': 'LiDAR', 'backend_args': None}}, 'val_evaluator': {'type': 'NuScenesMetric', 'data_root': 'data/nuscenes/', 'ann_file': 'data/nuscenes/nuscenes_infos_val.pkl', 'metric': 'bbox', 'backend_args': None}, 'test_evaluator': {'type': 'NuScenesMetric', 'data_root': 'data/nuscenes/', 'ann_file': 'data/nuscenes/nuscenes_infos_val.pkl', 'metric': 'bbox', 'backend_args': None}, 'optim_wrapper': {'type': 'OptimWrapper', 'optimizer': {'type': 'AdamW', 'lr': 0.0002, 'weight_decay': 0.01}, 'paramwise_cfg': {'custom_keys': {'img_backbone': {'lr_mult': 0.1}}}, 'clip_grad': {'max_norm': 35, 'norm_type': 2}}, 'param_scheduler': [{'type': 'LinearLR', 'start_factor': 0.3333333333333333, 'by_epoch': False, 'begin': 0, 'end': 500}, {'type': 'CosineAnnealingLR', 'by_epoch': True, 'begin': 0, 'end': 24, 'T_max': 24, 'eta_min_ratio': 0.001}], 'total_epochs': 24, 'train_cfg': {'type': 'EpochBasedTrainLoop', 'max_epochs': 24, 'val_interval': 2}, 'val_cfg': {'type': 'ValLoop'}, 'test_cfg': {'type': 'TestLoop'}, 'vis_backends': [{'type': 'TensorboardVisBackend'}], 'visualizer': {'type': 'Det3DLocalVisualizer', 
'vis_backends': [{'type': 'TensorboardVisBackend'}], 'name': 'visualizer'}}
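For reference, this cfg is an mmengine Config object. A minimal sketch of loading the same file outside the test script (assuming mmengine is installed and we run from the repo root) looks like this:

from mmengine.config import Config

# Load the same config that dist_test.sh passes to tools/test.py.
cfg = Config.fromfile('projects/DETR3D/configs/detr3d_r101_gridmask.py')
print(cfg.model.type)                         # 'DETR3D'
print(cfg.test_dataloader.dataset.ann_file)   # 'nuscenes_infos_val.pkl'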
After working through the initialization and configuration, the Runner is constructed here:
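In the mmengine-style tools/test.py this boils down to roughly the following (paraphrased; the real script also wires up work_dir, launcher and the checkpoint argument):

from mmengine.runner import Runner

# Build the model, dataloaders, loops and hooks described by the cfg above in one call,
# then hand control to the test loop (runner.test() ends up in TestLoop.run()).
runner = Runner.from_cfg(cfg)   # cfg as loaded in the previous snippet
runner.test()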
Start the test:
Load the test dataset.
Load the checkpoint:
Following the runner's test_loop property down the call stack until we reach eval():
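That chain bottoms out in mmengine's TestLoop, whose run() is roughly the following sketch (paraphrased from memory, hook calls omitted):

# Paraphrased shape of mmengine's TestLoop.run():
def run(self):
    self.runner.model.eval()                        # switch the whole model to eval mode
    for idx, data_batch in enumerate(self.dataloader):
        self.run_iter(idx, data_batch)              # -> outputs = self.runner.model.test_step(data_batch)
    return self.evaluator.evaluate(len(self.dataloader.dataset))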
The eval() implementation (this actually comes from torch.nn.Module, which mmengine models inherit from):
It simply calls torch's own train-mode switch:
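A quick standalone check that eval() is nothing more than train(False) flipping the training flag:

import torch.nn as nn

m = nn.Dropout(p=0.5)
m.eval()               # internally just: return self.train(False)
print(m.training)      # False -> dropout becomes a no-op at inference time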
Here, module is the instantiated model; inspecting module shows the detailed DETR3D model structure (it is very long):
DETR3D(
(data_preprocessor): Det3DDataPreprocessor()
(pts_bbox_head): DETR3DHead(
(loss_cls): FocalLoss()
(loss_bbox): L1Loss()
(loss_iou): GIoULoss()
(activate): ReLU(inplace=True)
(positional_encoding): SinePositionalEncoding(num_feats=128, temperature=10000, normalize=True, scale=6.283185307179586, eps=1e-06)
(transformer): Detr3DTransformer(
(decoder): Detr3DTransformerDecoder(
(layers): ModuleList(
(0): DetrTransformerDecoderLayer(
(attentions): ModuleList(
(0): MultiheadAttention(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
)
(proj_drop): Dropout(p=0.0, inplace=False)
(dropout_layer): Dropout(p=0.1, inplace=False)
)
(1): Detr3DCrossAtten(
(dropout): Dropout(p=0.1, inplace=False)
(attention_weights): Linear(in_features=256, out_features=24, bias=True)
(output_proj): Linear(in_features=256, out_features=256, bias=True)
(position_encoder): Sequential(
(0): Linear(in_features=3, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
)
)
)
(ffns): ModuleList(
(0): FFN(
(layers): Sequential(
(0): Sequential(
(0): Linear(in_features=256, out_features=512, bias=True)
(1): ReLU(inplace=True)
(2): Dropout(p=0.1, inplace=False)
)
(1): Linear(in_features=512, out_features=256, bias=True)
(2): Dropout(p=0.1, inplace=False)
)
(dropout_layer): Identity()
(gamma2): Identity()
)
)
(norms): ModuleList(
(0): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
)
)
(1): DetrTransformerDecoderLayer(
(attentions): ModuleList(
(0): MultiheadAttention(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
)
(proj_drop): Dropout(p=0.0, inplace=False)
(dropout_layer): Dropout(p=0.1, inplace=False)
)
(1): Detr3DCrossAtten(
(dropout): Dropout(p=0.1, inplace=False)
(attention_weights): Linear(in_features=256, out_features=24, bias=True)
(output_proj): Linear(in_features=256, out_features=256, bias=True)
(position_encoder): Sequential(
(0): Linear(in_features=3, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
)
)
)
(ffns): ModuleList(
(0): FFN(
(layers): Sequential(
(0): Sequential(
(0): Linear(in_features=256, out_features=512, bias=True)
(1): ReLU(inplace=True)
(2): Dropout(p=0.1, inplace=False)
)
(1): Linear(in_features=512, out_features=256, bias=True)
(2): Dropout(p=0.1, inplace=False)
)
(dropout_layer): Identity()
(gamma2): Identity()
)
)
(norms): ModuleList(
(0): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
)
)
(2): DetrTransformerDecoderLayer(
(attentions): ModuleList(
(0): MultiheadAttention(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
)
(proj_drop): Dropout(p=0.0, inplace=False)
(dropout_layer): Dropout(p=0.1, inplace=False)
)
(1): Detr3DCrossAtten(
(dropout): Dropout(p=0.1, inplace=False)
(attention_weights): Linear(in_features=256, out_features=24, bias=True)
(output_proj): Linear(in_features=256, out_features=256, bias=True)
(position_encoder): Sequential(
(0): Linear(in_features=3, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
)
)
)
(ffns): ModuleList(
(0): FFN(
(layers): Sequential(
(0): Sequential(
(0): Linear(in_features=256, out_features=512, bias=True)
(1): ReLU(inplace=True)
(2): Dropout(p=0.1, inplace=False)
)
(1): Linear(in_features=512, out_features=256, bias=True)
(2): Dropout(p=0.1, inplace=False)
)
(dropout_layer): Identity()
(gamma2): Identity()
)
)
(norms): ModuleList(
(0): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
)
)
(3): DetrTransformerDecoderLayer(
(attentions): ModuleList(
(0): MultiheadAttention(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
)
(proj_drop): Dropout(p=0.0, inplace=False)
(dropout_layer): Dropout(p=0.1, inplace=False)
)
(1): Detr3DCrossAtten(
(dropout): Dropout(p=0.1, inplace=False)
(attention_weights): Linear(in_features=256, out_features=24, bias=True)
(output_proj): Linear(in_features=256, out_features=256, bias=True)
(position_encoder): Sequential(
(0): Linear(in_features=3, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
)
)
)
(ffns): ModuleList(
(0): FFN(
(layers): Sequential(
(0): Sequential(
(0): Linear(in_features=256, out_features=512, bias=True)
(1): ReLU(inplace=True)
(2): Dropout(p=0.1, inplace=False)
)
(1): Linear(in_features=512, out_features=256, bias=True)
(2): Dropout(p=0.1, inplace=False)
)
(dropout_layer): Identity()
(gamma2): Identity()
)
)
(norms): ModuleList(
(0): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
)
)
(4): DetrTransformerDecoderLayer(
(attentions): ModuleList(
(0): MultiheadAttention(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
)
(proj_drop): Dropout(p=0.0, inplace=False)
(dropout_layer): Dropout(p=0.1, inplace=False)
)
(1): Detr3DCrossAtten(
(dropout): Dropout(p=0.1, inplace=False)
(attention_weights): Linear(in_features=256, out_features=24, bias=True)
(output_proj): Linear(in_features=256, out_features=256, bias=True)
(position_encoder): Sequential(
(0): Linear(in_features=3, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
)
)
)
(ffns): ModuleList(
(0): FFN(
(layers): Sequential(
(0): Sequential(
(0): Linear(in_features=256, out_features=512, bias=True)
(1): ReLU(inplace=True)
(2): Dropout(p=0.1, inplace=False)
)
(1): Linear(in_features=512, out_features=256, bias=True)
(2): Dropout(p=0.1, inplace=False)
)
(dropout_layer): Identity()
(gamma2): Identity()
)
)
(norms): ModuleList(
(0): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
)
)
(5): DetrTransformerDecoderLayer(
(attentions): ModuleList(
(0): MultiheadAttention(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
)
(proj_drop): Dropout(p=0.0, inplace=False)
(dropout_layer): Dropout(p=0.1, inplace=False)
)
(1): Detr3DCrossAtten(
(dropout): Dropout(p=0.1, inplace=False)
(attention_weights): Linear(in_features=256, out_features=24, bias=True)
(output_proj): Linear(in_features=256, out_features=256, bias=True)
(position_encoder): Sequential(
(0): Linear(in_features=3, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
)
)
)
(ffns): ModuleList(
(0): FFN(
(layers): Sequential(
(0): Sequential(
(0): Linear(in_features=256, out_features=512, bias=True)
(1): ReLU(inplace=True)
(2): Dropout(p=0.1, inplace=False)
)
(1): Linear(in_features=512, out_features=256, bias=True)
(2): Dropout(p=0.1, inplace=False)
)
(dropout_layer): Identity()
(gamma2): Identity()
)
)
(norms): ModuleList(
(0): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
)
)
)
)
(reference_points): Linear(in_features=256, out_features=3, bias=True)
)
(cls_branches): ModuleList(
(0): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
(6): Linear(in_features=256, out_features=10, bias=True)
)
(1): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
(6): Linear(in_features=256, out_features=10, bias=True)
)
(2): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
(6): Linear(in_features=256, out_features=10, bias=True)
)
(3): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
(6): Linear(in_features=256, out_features=10, bias=True)
)
(4): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
(6): Linear(in_features=256, out_features=10, bias=True)
)
(5): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
(6): Linear(in_features=256, out_features=10, bias=True)
)
)
(reg_branches): ModuleList(
(0): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): ReLU()
(2): Linear(in_features=256, out_features=256, bias=True)
(3): ReLU()
(4): Linear(in_features=256, out_features=10, bias=True)
)
(1): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): ReLU()
(2): Linear(in_features=256, out_features=256, bias=True)
(3): ReLU()
(4): Linear(in_features=256, out_features=10, bias=True)
)
(2): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): ReLU()
(2): Linear(in_features=256, out_features=256, bias=True)
(3): ReLU()
(4): Linear(in_features=256, out_features=10, bias=True)
)
(3): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): ReLU()
(2): Linear(in_features=256, out_features=256, bias=True)
(3): ReLU()
(4): Linear(in_features=256, out_features=10, bias=True)
)
(4): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): ReLU()
(2): Linear(in_features=256, out_features=256, bias=True)
(3): ReLU()
(4): Linear(in_features=256, out_features=10, bias=True)
)
(5): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): ReLU()
(2): Linear(in_features=256, out_features=256, bias=True)
(3): ReLU()
(4): Linear(in_features=256, out_features=10, bias=True)
)
)
(query_embedding): Embedding(900, 512)
)
(img_backbone): ResNet(
(conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
(layer1): ResLayer(
(0): Bottleneck(
(conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(1): Bottleneck(
(conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(2): Bottleneck(
(conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
)
(layer2): ResLayer(
(0): Bottleneck(
(conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(1): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(2): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(3): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
)
(layer3): ResLayer(
(0): Bottleneck(
(conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(1): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(2): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(3): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(4): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(5): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(6): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(7): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(8): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(9): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(10): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(11): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(12): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(13): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(14): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(15): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(16): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(17): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(18): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(19): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(20): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(21): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(22): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
)
(layer4): ResLayer(
(0): Bottleneck(
(conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(512, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(1): Bottleneck(
(conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(512, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(2): Bottleneck(
(conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(512, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
)
)
init_cfg=[{'type': 'Kaiming', 'layer': 'Conv2d'}, {'type': 'Constant', 'val': 1, 'layer': ['_BatchNorm', 'GroupNorm']}]
(img_neck): FPN(
(lateral_convs): ModuleList(
(0): ConvModule(
(conv): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
)
(1): ConvModule(
(conv): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
)
(2): ConvModule(
(conv): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
)
)
(fpn_convs): ModuleList(
(0): ConvModule(
(conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(1): ConvModule(
(conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(2): ConvModule(
(conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(3): ConvModule(
(conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)
)
)
init_cfg={'type': 'Xavier', 'layer': 'Conv2d', 'distribution': 'uniform'}
(grid_mask): GridMask()
)
Stepping further into the code, I found that only at this point does it enter the dataloader and start iterating over the data. I had assumed that walking the module meant eval was already running, but that was apparently still part of instantiating the model.
Iterating through this for loop, a call to something named nms_free_coder shows up, which should be the actual task-specific logic, so DETR3D inference has in fact started here. The next step is to drill into the concrete execution flow; a conceptual sketch of what the NMS-free decoder does follows below.
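From the bbox_coder entry in the config above (NMSFreeCoder with max_num=300), the decoding step presumably works along these lines; the snippet below is only a conceptual paraphrase of NMS-free decoding, not the project's actual source:

import torch

def nms_free_decode(cls_scores, bbox_preds, max_num=300, num_classes=10):
    # cls_scores: (num_query, num_classes) logits; bbox_preds: (num_query, box_dim) boxes.
    scores = cls_scores.sigmoid().view(-1)        # one score per (query, class) pair
    top_scores, top_idx = scores.topk(max_num)    # no NMS: just keep the top-k pairs
    labels = top_idx % num_classes
    boxes = bbox_preds[top_idx // num_classes]
    return boxes, top_scores, labels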
We can see the dataloader length is 81, which matches the dataset size expected earlier.
Stepping into the wrapper around this execution flow, the return statement contains a function call together with argument unpacking.
Going down func's call stack and drilling through a few levels, I found outputs = self.runner.model.test_step(data_batch), the classic output = model(input) pattern, so we are getting close.
return self.module.test_step(data) should be the key step:
From the distributed wrapper we land in mmengine's BaseModel code, where test_step is the more interesting part:
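For orientation, mmengine's BaseModel.test_step and the _run_forward it calls look roughly like this (paraphrased; see mmengine/model/base_model/base_model.py for the real code):

# Paraphrased from mmengine's BaseModel:
def test_step(self, data):
    data = self.data_preprocessor(data, False)      # stack/pad the batch and move it to the device
    return self._run_forward(data, mode='predict')  # dispatch to forward(..., mode='predict')

def _run_forward(self, data, mode):
    if isinstance(data, dict):
        return self(**data, mode=mode)
    return self(*data, mode=mode)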
This jumps to _run_forward; from the breakpoint we can see that the current mode is the inference mode and that data is a dict.
Looking at the input data format, mmdet has its own data wrappers, which I do not fully understand yet. For the image tensor in the inputs, the guess is 1 (batch) * 6 (views) * 3 (channels) * H * W.
Continuing with breakpoints, we finally move from the engine into mmdet3d's task-level code. This is a detector class, presumably a generic base class for detectors, so we are still one step away from the real core of the DETR3D code.
From the function docstring (these docstrings are really nicely written) we can see there are several modes: loss is the normal training mode, while inference takes another branch. There are three modes in total; the details are not clear yet (excerpt below, followed by a sketch of the dispatch):
41 - "tensor": Forward the whole network and return tensor or tuple of
42 tensor without any post-processing, same as a common nn.Module.
43 - "predict": Forward and return the predictions, which are fully
44 processed to a list of :obj:`Det3DDataSample`.
45 - "loss": Forward and return a dict of losses according to the given
46 inputs and data samples.
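Underneath that docstring, the base detector's forward is essentially a three-way dispatch on mode, roughly:

# Paraphrased mode dispatch in the base detector's forward():
def forward(self, inputs, data_samples=None, mode='tensor', **kwargs):
    if mode == 'loss':
        return self.loss(inputs, data_samples, **kwargs)       # training
    elif mode == 'predict':
        return self.predict(inputs, data_samples, **kwargs)    # our path during testing
    elif mode == 'tensor':
        return self._forward(inputs, data_samples, **kwargs)   # raw tensors, no post-processing
    raise RuntimeError(f'Invalid mode "{mode}"')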
Continuing with breakpoints, we finally arrive at DETR3D's inference code:
# original simple_test
def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
batch_data_samples: List[Det3DDataSample],
**kwargs) -> List[Det3DDataSample]:
"""Forward of testing.
Args:
batch_inputs_dict (dict): The model input dict which include
`imgs` keys.
- imgs (torch.Tensor): Tensor of batched multi-view images.
It has shape (B, N, C, H ,W)
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
Samples. It usually includes information such as
`gt_instance_3d`.
Returns:
list[:obj:`Det3DDataSample`]: Detection results of the
input sample. Each Det3DDataSample usually contain
'pred_instances_3d'. And the ``pred_instances_3d`` usually
contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instances, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bbox_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
contains a tensor with shape (num_instances, 9).
"""
batch_input_metas = [item.metainfo for item in batch_data_samples]
batch_input_metas = self.add_lidar2img(batch_input_metas)
img_feats = self.extract_feat(batch_inputs_dict, batch_input_metas)
outs = self.pts_bbox_head(img_feats, batch_input_metas)
results_list_3d = self.pts_bbox_head.predict_by_feat(
outs, batch_input_metas, **kwargs)
# change the bboxes' format
detsamples = self.add_pred_to_datasample(batch_data_samples,
results_list_3d)
return detsamples
Reading the docstring gives us some information:
- The earlier guess about the dimensions of imgs was correct: it is (B, N, C, H, W).
- Det3DDataSample carries a set of keys; here they are the scores, labels and bboxes, which in the autonomous-driving setting map to confidence, class label and object pose (position plus heading, size, attributes and so on). A quick inspection snippet follows below.
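A hypothetical way to poke at one returned sample in pdb (attribute names follow mmdet3d conventions; detsamples is what predict() returns above):

# Hypothetical pdb inspection of a single result.
sample = detsamples[0]
pred = sample.pred_instances_3d
print(pred.scores_3d.shape)           # (num_instances,) confidence scores
print(pred.labels_3d.shape)           # (num_instances,) class indices
print(pred.bboxes_3d.tensor.shape)    # (num_instances, 9) boxes: center, size, yaw, velocity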
Stepping through with breakpoints, we can see this is the classic detector pipeline:
- Data preprocessing
- Image feature extraction
- Task head
- Output postprocessing
We skip ahead quickly and find that after this the control flow returns to the engine side to iterate over the data samples. In other words, what we just walked through is the core algorithm code.
---------
Second debugging pass:
----------
After the first pass, I adjusted the breakpoints: this time they go directly into the DETR3D code, at two places, the DETR3D class __init__ and the predict inference method (see the breakpoint() calls in the listing below).
from typing import Dict, List, Optional
import torch
from torch import Tensor
from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector
from mmdet3d.registry import MODELS
from mmdet3d.structures import Det3DDataSample
from mmdet3d.structures.bbox_3d.utils import get_lidar2img
from .grid_mask import GridMask
@MODELS.register_module()
class DETR3D(MVXTwoStageDetector):
"""DETR3D: 3D Object Detection from Multi-view Images via 3D-to-2D Queries
Args:
data_preprocessor (dict or ConfigDict, optional): The pre-process
config of :class:`Det3DDataPreprocessor`. Defaults to None.
use_grid_mask (bool) : Data augmentation. Whether to mask out some
grids during extract_img_feat. Defaults to False.
img_backbone (dict, optional): Backbone of extracting
images feature. Defaults to None.
img_neck (dict, optional): Neck of extracting
image features. Defaults to None.
pts_bbox_head (dict, optional): Bboxes head of
detr3d. Defaults to None.
train_cfg (dict, optional): Train config of model.
Defaults to None.
test_cfg (dict, optional): Train config of model.
Defaults to None.
init_cfg (dict, optional): Initialize config of
model. Defaults to None.
"""
def __init__(self,
data_preprocessor=None,
use_grid_mask=False,
img_backbone=None,
img_neck=None,
pts_bbox_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None):
breakpoint()
super(DETR3D, self).__init__(
img_backbone=img_backbone,
img_neck=img_neck,
pts_bbox_head=pts_bbox_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
data_preprocessor=data_preprocessor)
self.grid_mask = GridMask(
True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7)
self.use_grid_mask = use_grid_mask
def extract_img_feat(self, img: Tensor,
batch_input_metas: List[dict]) -> List[Tensor]:
"""Extract features from images.
Args:
img (tensor): Batched multi-view image tensor with
shape (B, N, C, H, W).
batch_input_metas (list[dict]): Meta information of multiple inputs
in a batch.
Returns:
list[tensor]: multi-level image features.
"""
B = img.size(0)
if img is not None:
input_shape = img.shape[-2:] # bs nchw
# update real input shape of each single img
for img_meta in batch_input_metas:
img_meta.update(input_shape=input_shape)
if img.dim() == 5 and img.size(0) == 1:
img.squeeze_()
elif img.dim() == 5 and img.size(0) > 1:
B, N, C, H, W = img.size()
img = img.view(B * N, C, H, W)
if self.use_grid_mask:
img = self.grid_mask(img) # mask out some grids
img_feats = self.img_backbone(img)
if isinstance(img_feats, dict):
img_feats = list(img_feats.values())
else:
return None
if self.with_img_neck:
img_feats = self.img_neck(img_feats)
img_feats_reshaped = []
for img_feat in img_feats:
BN, C, H, W = img_feat.size()
img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W))
return img_feats_reshaped
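# (Reading note, not part of the original file: for B = 1 the squeeze_() above turns the
#  (1, 6, 3, H, W) multi-view tensor into (6, 3, H, W), so the ResNet backbone and FPN see
#  the six camera views as a batch of six ordinary images; the final view() call restores
#  the (B, N, C, H', W') layout per FPN level before the DETR3D head consumes it.)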
def extract_feat(self, batch_inputs_dict: Dict,
batch_input_metas: List[dict]) -> List[Tensor]:
"""Extract features from images.
Refer to self.extract_img_feat()
"""
imgs = batch_inputs_dict.get('imgs', None)
img_feats = self.extract_img_feat(imgs, batch_input_metas)
return img_feats
def _forward(self):
raise NotImplementedError('tensor mode is yet to add')
# original forward_train
def loss(self, batch_inputs_dict: Dict[List, Tensor],
batch_data_samples: List[Det3DDataSample],
**kwargs) -> List[Det3DDataSample]:
"""
Args:
batch_inputs_dict (dict): The model input dict which include
`imgs` keys.
- imgs (torch.Tensor): Tensor of batched multi-view images.
It has shape (B, N, C, H ,W)
batch_data_samples (List[obj:`Det3DDataSample`]): The Data Samples
It usually includes information such as `gt_instance_3d`.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
batch_input_metas = [item.metainfo for item in batch_data_samples]
batch_input_metas = self.add_lidar2img(batch_input_metas)
img_feats = self.extract_feat(batch_inputs_dict, batch_input_metas)
outs = self.pts_bbox_head(img_feats, batch_input_metas, **kwargs)
batch_gt_instances_3d = [
item.gt_instances_3d for item in batch_data_samples
]
loss_inputs = [batch_gt_instances_3d, outs]
losses_pts = self.pts_bbox_head.loss_by_feat(*loss_inputs)
return losses_pts
# original simple_test
def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
batch_data_samples: List[Det3DDataSample],
**kwargs) -> List[Det3DDataSample]:
"""Forward of testing.
Args:
batch_inputs_dict (dict): The model input dict which include
`imgs` keys.
- imgs (torch.Tensor): Tensor of batched multi-view images.
It has shape (B, N, C, H ,W)
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
Samples. It usually includes information such as
`gt_instance_3d`.
Returns:
list[:obj:`Det3DDataSample`]: Detection results of the
input sample. Each Det3DDataSample usually contain
'pred_instances_3d'. And the ``pred_instances_3d`` usually
contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instances, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bbox_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
contains a tensor with shape (num_instances, 9).
"""
breakpoint()
batch_input_metas = [item.metainfo for item in batch_data_samples]
batch_input_metas = self.add_lidar2img(batch_input_metas)
img_feats = self.extract_feat(batch_inputs_dict, batch_input_metas)
outs = self.pts_bbox_head(img_feats, batch_input_metas)
results_list_3d = self.pts_bbox_head.predict_by_feat(
outs, batch_input_metas, **kwargs)
# change the bboxes' format
detsamples = self.add_pred_to_datasample(batch_data_samples,
results_list_3d)
return detsamples
# may need speed-up
def add_lidar2img(self, batch_input_metas: List[Dict]) -> List[Dict]:
"""add 'lidar2img' transformation matrix into batch_input_metas.
Args:
batch_input_metas (list[dict]): Meta information of multiple inputs
in a batch.
Returns:
batch_input_metas (list[dict]): Meta info with lidar2img added
"""
for meta in batch_input_metas:
l2i = list()
for i in range(len(meta['cam2img'])):
c2i = torch.tensor(meta['cam2img'][i]).double()
l2c = torch.tensor(meta['lidar2cam'][i]).double()
l2i.append(get_lidar2img(c2i, l2c).float().numpy())
meta['lidar2img'] = l2i
return batch_input_metas
From: https://blog.csdn.net/Yougrianes/article/details/143134914