Spent a day getting the mmdetection3d and DETR3D code environment working on a new PC
Picking up from the previous post.
Working from the test command bash tools/dist_test.sh projects/DETR3D/configs/detr3d_r101_gridmask.py ~/Downloads/detr3d_r101_gridmask.pth 1,
the entry point should be tools/test.py, which dist_test.sh launches.
Then set a breakpoint and step in to read the code:
After entering the main function, the first thing that happens is the initialization of a long, long config:
(Pdb) cfg
Config (path: projects/DETR3D/configs/detr3d_r101_gridmask.py): {'default_scope': 'mmdet3d', 'default_hooks': {'timer': {'type': 'IterTimerHook'}, 'logger': {'type': 'LoggerHook', 'interval': 50}, 'param_scheduler': {'type': 'ParamSchedulerHook'}, 'checkpoint': {'type': 'CheckpointHook', 'interval': 1, 'max_keep_ckpts': 1, 'save_last': True}, 'sampler_seed': {'type': 'DistSamplerSeedHook'}, 'visualization': {'type': 'Det3DVisualizationHook'}}, 'env_cfg': {'cudnn_benchmark': False, 'mp_cfg': {'mp_start_method': 'fork', 'opencv_num_threads': 0}, 'dist_cfg': {'backend': 'nccl'}}, 'log_processor': {'type': 'LogProcessor', 'window_size': 50, 'by_epoch': True}, 'log_level': 'INFO', 'load_from': 'ckpts/fcos3d.pth', 'resume': False, 'custom_imports': {'imports': ['projects.DETR3D.detr3d']}, 'point_cloud_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], 'voxel_size': [0.2, 0.2, 8], 'img_norm_cfg': {'mean': [103.53, 116.28, 123.675], 'std': [1.0, 1.0, 1.0], 'bgr_to_rgb': False}, 'class_names': ['car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'], 'input_modality': {'use_lidar': False, 'use_camera': True, 'use_radar': False, 'use_map': False, 'use_external': False}, 'model': {'type': 'DETR3D', 'use_grid_mask': True, 'data_preprocessor': {'type': 'Det3DDataPreprocessor', 'mean': [103.53, 116.28, 123.675], 'std': [1.0, 1.0, 1.0], 'bgr_to_rgb': False, 'pad_size_divisor': 32}, 'img_backbone': {'type': 'mmdet.ResNet', 'depth': 101, 'num_stages': 4, 'out_indices': (0, 1, 2, 3), 'frozen_stages': 1, 'norm_cfg': {'type': 'BN2d', 'requires_grad': False}, 'norm_eval': True, 'style': 'caffe', 'dcn': {'type': 'DCNv2', 'deform_groups': 1, 'fallback_on_stride': False}, 'stage_with_dcn': (False, False, True, True)}, 'img_neck': {'type': 'mmdet.FPN', 'in_channels': [256, 512, 1024, 2048], 'out_channels': 256, 'start_level': 1, 'add_extra_convs': 'on_output', 'num_outs': 4, 'relu_before_extra_convs': True}, 'pts_bbox_head': {'type': 'DETR3DHead', 'num_query': 900, 'num_classes': 10, 'in_channels': 256, 'sync_cls_avg_factor': True, 'with_box_refine': True, 'as_two_stage': False, 'transformer': {'type': 'Detr3DTransformer', 'decoder': {'type': 'Detr3DTransformerDecoder', 'num_layers': 6, 'return_intermediate': True, 'transformerlayers': {'type': 'mmdet.DetrTransformerDecoderLayer', 'attn_cfgs': [{'type': 'MultiheadAttention', 'embed_dims': 256, 'num_heads': 8, 'dropout': 0.1}, {'type': 'Detr3DCrossAtten', 'pc_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], 'num_points': 1, 'embed_dims': 256}], 'feedforward_channels': 512, 'ffn_dropout': 0.1, 'operation_order': ('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')}}}, 'bbox_coder': {'type': 'NMSFreeCoder', 'post_center_range': [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], 'pc_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], 'max_num': 300, 'voxel_size': [0.2, 0.2, 8], 'num_classes': 10}, 'positional_encoding': {'type': 'mmdet.SinePositionalEncoding', 'num_feats': 128, 'normalize': True, 'offset': -0.5}, 'loss_cls': {'type': 'mmdet.FocalLoss', 'use_sigmoid': True, 'gamma': 2.0, 'alpha': 0.25, 'loss_weight': 2.0}, 'loss_bbox': {'type': 'mmdet.L1Loss', 'loss_weight': 0.25}, 'loss_iou': {'type': 'mmdet.GIoULoss', 'loss_weight': 0.0}}, 'train_cfg': {'pts': {'grid_size': [512, 512, 1], 'voxel_size': [0.2, 0.2, 8], 'point_cloud_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], 'out_size_factor': 4, 'assigner': {'type': 'HungarianAssigner3D', 'cls_cost': {'type': 'mmdet.FocalLossCost', 'weight': 2.0}, 'reg_cost': 
{'type': 'BBox3DL1Cost', 'weight': 0.25}, 'iou_cost': {'type': 'mmdet.IoUCost', 'weight': 0.0}, 'pc_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]}}}}, 'dataset_type': 'NuScenesDataset', 'data_root': 'data/nuscenes/', 'test_transforms': [{'type': 'RandomResize3D', 'scale': (1600, 900), 'ratio_range': (1.0, 1.0), 'keep_ratio': True}], 'train_transforms': [{'type': 'PhotoMetricDistortion3D'}, {'type': 'RandomResize3D', 'scale': (1600, 900), 'ratio_range': (1.0, 1.0), 'keep_ratio': True}], 'backend_args': None, 'train_pipeline': [{'type': 'LoadMultiViewImageFromFiles', 'to_float32': True, 'num_views': 6, 'backend_args': None}, {'type': 'LoadAnnotations3D', 'with_bbox_3d': True, 'with_label_3d': True, 'with_attr_label': False}, {'type': 'MultiViewWrapper', 'transforms': [{'type': 'PhotoMetricDistortion3D'}, {'type': 'RandomResize3D', 'scale': (1600, 900), 'ratio_range': (1.0, 1.0), 'keep_ratio': True}]}, {'type': 'ObjectRangeFilter', 'point_cloud_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]}, {'type': 'ObjectNameFilter', 'classes': ['car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone']}, {'type': 'Pack3DDetInputs', 'keys': ['img', 'gt_bboxes_3d', 'gt_labels_3d']}], 'test_pipeline': [{'type': 'LoadMultiViewImageFromFiles', 'to_float32': True, 'num_views': 6, 'backend_args': None}, {'type': 'MultiViewWrapper', 'transforms': [{'type': 'RandomResize3D', 'scale': (1600, 900), 'ratio_range': (1.0, 1.0), 'keep_ratio': True}]}, {'type': 'Pack3DDetInputs', 'keys': ['img']}], 'metainfo': {'classes': ['car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone']}, 'data_prefix': {'pts': '', 'CAM_FRONT': 'samples/CAM_FRONT', 'CAM_FRONT_LEFT': 'samples/CAM_FRONT_LEFT', 'CAM_FRONT_RIGHT': 'samples/CAM_FRONT_RIGHT', 'CAM_BACK': 'samples/CAM_BACK', 'CAM_BACK_RIGHT': 'samples/CAM_BACK_RIGHT', 'CAM_BACK_LEFT': 'samples/CAM_BACK_LEFT'}, 'train_dataloader': {'batch_size': 1, 'num_workers': 4, 'persistent_workers': True, 'drop_last': False, 'sampler': {'type': 'DefaultSampler', 'shuffle': True}, 'dataset': {'type': 'NuScenesDataset', 'data_root': 'data/nuscenes/', 'ann_file': 'nuscenes_infos_train.pkl', 'pipeline': [{'type': 'LoadMultiViewImageFromFiles', 'to_float32': True, 'num_views': 6, 'backend_args': None}, {'type': 'LoadAnnotations3D', 'with_bbox_3d': True, 'with_label_3d': True, 'with_attr_label': False}, {'type': 'MultiViewWrapper', 'transforms': [{'type': 'PhotoMetricDistortion3D'}, {'type': 'RandomResize3D', 'scale': (1600, 900), 'ratio_range': (1.0, 1.0), 'keep_ratio': True}]}, {'type': 'ObjectRangeFilter', 'point_cloud_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]}, {'type': 'ObjectNameFilter', 'classes': ['car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone']}, {'type': 'Pack3DDetInputs', 'keys': ['img', 'gt_bboxes_3d', 'gt_labels_3d']}], 'load_type': 'frame_based', 'metainfo': {'classes': ['car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone']}, 'modality': {'use_lidar': False, 'use_camera': True, 'use_radar': False, 'use_map': False, 'use_external': False}, 'test_mode': False, 'data_prefix': {'pts': '', 'CAM_FRONT': 'samples/CAM_FRONT', 'CAM_FRONT_LEFT': 'samples/CAM_FRONT_LEFT', 'CAM_FRONT_RIGHT': 'samples/CAM_FRONT_RIGHT', 'CAM_BACK': 'samples/CAM_BACK', 'CAM_BACK_RIGHT': 'samples/CAM_BACK_RIGHT', 'CAM_BACK_LEFT': 
'samples/CAM_BACK_LEFT'}, 'box_type_3d': 'LiDAR', 'backend_args': None}}, 'val_dataloader': {'batch_size': 1, 'num_workers': 4, 'persistent_workers': True, 'drop_last': False, 'sampler': {'type': 'DefaultSampler', 'shuffle': False}, 'dataset': {'type': 'NuScenesDataset', 'data_root': 'data/nuscenes/', 'ann_file': 'nuscenes_infos_val.pkl', 'load_type': 'frame_based', 'pipeline': [{'type': 'LoadMultiViewImageFromFiles', 'to_float32': True, 'num_views': 6, 'backend_args': None}, {'type': 'MultiViewWrapper', 'transforms': [{'type': 'RandomResize3D', 'scale': (1600, 900), 'ratio_range': (1.0, 1.0), 'keep_ratio': True}]}, {'type': 'Pack3DDetInputs', 'keys': ['img']}], 'metainfo': {'classes': ['car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone']}, 'modality': {'use_lidar': False, 'use_camera': True, 'use_radar': False, 'use_map': False, 'use_external': False}, 'test_mode': True, 'data_prefix': {'pts': '', 'CAM_FRONT': 'samples/CAM_FRONT', 'CAM_FRONT_LEFT': 'samples/CAM_FRONT_LEFT', 'CAM_FRONT_RIGHT': 'samples/CAM_FRONT_RIGHT', 'CAM_BACK': 'samples/CAM_BACK', 'CAM_BACK_RIGHT': 'samples/CAM_BACK_RIGHT', 'CAM_BACK_LEFT': 'samples/CAM_BACK_LEFT'}, 'box_type_3d': 'LiDAR', 'backend_args': None}}, 'test_dataloader': {'batch_size': 1, 'num_workers': 4, 'persistent_workers': True, 'drop_last': False, 'sampler': {'type': 'DefaultSampler', 'shuffle': False}, 'dataset': {'type': 'NuScenesDataset', 'data_root': 'data/nuscenes/', 'ann_file': 'nuscenes_infos_val.pkl', 'load_type': 'frame_based', 'pipeline': [{'type': 'LoadMultiViewImageFromFiles', 'to_float32': True, 'num_views': 6, 'backend_args': None}, {'type': 'MultiViewWrapper', 'transforms': [{'type': 'RandomResize3D', 'scale': (1600, 900), 'ratio_range': (1.0, 1.0), 'keep_ratio': True}]}, {'type': 'Pack3DDetInputs', 'keys': ['img']}], 'metainfo': {'classes': ['car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone']}, 'modality': {'use_lidar': False, 'use_camera': True, 'use_radar': False, 'use_map': False, 'use_external': False}, 'test_mode': True, 'data_prefix': {'pts': '', 'CAM_FRONT': 'samples/CAM_FRONT', 'CAM_FRONT_LEFT': 'samples/CAM_FRONT_LEFT', 'CAM_FRONT_RIGHT': 'samples/CAM_FRONT_RIGHT', 'CAM_BACK': 'samples/CAM_BACK', 'CAM_BACK_RIGHT': 'samples/CAM_BACK_RIGHT', 'CAM_BACK_LEFT': 'samples/CAM_BACK_LEFT'}, 'box_type_3d': 'LiDAR', 'backend_args': None}}, 'val_evaluator': {'type': 'NuScenesMetric', 'data_root': 'data/nuscenes/', 'ann_file': 'data/nuscenes/nuscenes_infos_val.pkl', 'metric': 'bbox', 'backend_args': None}, 'test_evaluator': {'type': 'NuScenesMetric', 'data_root': 'data/nuscenes/', 'ann_file': 'data/nuscenes/nuscenes_infos_val.pkl', 'metric': 'bbox', 'backend_args': None}, 'optim_wrapper': {'type': 'OptimWrapper', 'optimizer': {'type': 'AdamW', 'lr': 0.0002, 'weight_decay': 0.01}, 'paramwise_cfg': {'custom_keys': {'img_backbone': {'lr_mult': 0.1}}}, 'clip_grad': {'max_norm': 35, 'norm_type': 2}}, 'param_scheduler': [{'type': 'LinearLR', 'start_factor': 0.3333333333333333, 'by_epoch': False, 'begin': 0, 'end': 500}, {'type': 'CosineAnnealingLR', 'by_epoch': True, 'begin': 0, 'end': 24, 'T_max': 24, 'eta_min_ratio': 0.001}], 'total_epochs': 24, 'train_cfg': {'type': 'EpochBasedTrainLoop', 'max_epochs': 24, 'val_interval': 2}, 'val_cfg': {'type': 'ValLoop'}, 'test_cfg': {'type': 'TestLoop'}, 'vis_backends': [{'type': 'TensorboardVisBackend'}], 'visualizer': {'type': 'Det3DLocalVisualizer', 
'vis_backends': [{'type': 'TensorboardVisBackend'}], 'name': 'visualizer'}}
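For reference, this cfg is an mmengine Config object. A minimal sketch of loading the same file outside the test script (assuming mmengine is installed and we run from the repo root) looks like this:

from mmengine.config import Config

# Load the same config that dist_test.sh passes to tools/test.py.
cfg = Config.fromfile('projects/DETR3D/configs/detr3d_r101_gridmask.py')
print(cfg.model.type)                         # 'DETR3D'
print(cfg.test_dataloader.dataset.ann_file)   # 'nuscenes_infos_val.pkl'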
After working through the initialization and configuration, the Runner is constructed here:
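In the mmengine-style tools/test.py this boils down to roughly the following (paraphrased; the real script also wires up work_dir, launcher and the checkpoint argument):

from mmengine.runner import Runner

# Build the model, dataloaders, loops and hooks described by the cfg above in one call,
# then hand control to the test loop (runner.test() ends up in TestLoop.run()).
runner = Runner.from_cfg(cfg)   # cfg as loaded in the previous snippet
runner.test()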
Start the test:
Load the test dataset.
Load the checkpoint:
Following the runner's test_loop property down the call stack until we reach eval():
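That chain bottoms out in mmengine's TestLoop, whose run() is roughly the following sketch (paraphrased from memory, hook calls omitted):

# Paraphrased shape of mmengine's TestLoop.run():
def run(self):
    self.runner.model.eval()                        # switch the whole model to eval mode
    for idx, data_batch in enumerate(self.dataloader):
        self.run_iter(idx, data_batch)              # -> outputs = self.runner.model.test_step(data_batch)
    return self.evaluator.evaluate(len(self.dataloader.dataset))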
The eval() implementation (this actually comes from torch.nn.Module, which mmengine models inherit from):
It simply calls torch's own train-mode switch:
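A quick standalone check that eval() is nothing more than train(False) flipping the training flag:

import torch.nn as nn

m = nn.Dropout(p=0.5)
m.eval()               # internally just: return self.train(False)
print(m.training)      # False -> dropout becomes a no-op at inference time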
Here, module is the instantiated model; inspecting module shows the detailed DETR3D model structure (it is very long):
DETR3D(
(data_preprocessor): Det3DDataPreprocessor()
(pts_bbox_head): DETR3DHead(
(loss_cls): FocalLoss()
(loss_bbox): L1Loss()
(loss_iou): GIoULoss()
(activate): ReLU(inplace=True)
(positional_encoding): SinePositionalEncoding(num_feats=128, temperature=10000, normalize=True, scale=6.283185307179586, eps=1e-06)
(transformer): Detr3DTransformer(
(decoder): Detr3DTransformerDecoder(
(layers): ModuleList(
(0): DetrTransformerDecoderLayer(
(attentions): ModuleList(
(0): MultiheadAttention(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
)
(proj_drop): Dropout(p=0.0, inplace=False)
(dropout_layer): Dropout(p=0.1, inplace=False)
)
(1): Detr3DCrossAtten(
(dropout): Dropout(p=0.1, inplace=False)
(attention_weights): Linear(in_features=256, out_features=24, bias=True)
(output_proj): Linear(in_features=256, out_features=256, bias=True)
(position_encoder): Sequential(
(0): Linear(in_features=3, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
)
)
)
(ffns): ModuleList(
(0): FFN(
(layers): Sequential(
(0): Sequential(
(0): Linear(in_features=256, out_features=512, bias=True)
(1): ReLU(inplace=True)
(2): Dropout(p=0.1, inplace=False)
)
(1): Linear(in_features=512, out_features=256, bias=True)
(2): Dropout(p=0.1, inplace=False)
)
(dropout_layer): Identity()
(gamma2): Identity()
)
)
(norms): ModuleList(
(0): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
)
)
(1): DetrTransformerDecoderLayer(
(attentions): ModuleList(
(0): MultiheadAttention(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
)
(proj_drop): Dropout(p=0.0, inplace=False)
(dropout_layer): Dropout(p=0.1, inplace=False)
)
(1): Detr3DCrossAtten(
(dropout): Dropout(p=0.1, inplace=False)
(attention_weights): Linear(in_features=256, out_features=24, bias=True)
(output_proj): Linear(in_features=256, out_features=256, bias=True)
(position_encoder): Sequential(
(0): Linear(in_features=3, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
)
)
)
(ffns): ModuleList(
(0): FFN(
(layers): Sequential(
(0): Sequential(
(0): Linear(in_features=256, out_features=512, bias=True)
(1): ReLU(inplace=True)
(2): Dropout(p=0.1, inplace=False)
)
(1): Linear(in_features=512, out_features=256, bias=True)
(2): Dropout(p=0.1, inplace=False)
)
(dropout_layer): Identity()
(gamma2): Identity()
)
)
(norms): ModuleList(
(0): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
)
)
(2): DetrTransformerDecoderLayer(
(attentions): ModuleList(
(0): MultiheadAttention(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
)
(proj_drop): Dropout(p=0.0, inplace=False)
(dropout_layer): Dropout(p=0.1, inplace=False)
)
(1): Detr3DCrossAtten(
(dropout): Dropout(p=0.1, inplace=False)
(attention_weights): Linear(in_features=256, out_features=24, bias=True)
(output_proj): Linear(in_features=256, out_features=256, bias=True)
(position_encoder): Sequential(
(0): Linear(in_features=3, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
)
)
)
(ffns): ModuleList(
(0): FFN(
(layers): Sequential(
(0): Sequential(
(0): Linear(in_features=256, out_features=512, bias=True)
(1): ReLU(inplace=True)
(2): Dropout(p=0.1, inplace=False)
)
(1): Linear(in_features=512, out_features=256, bias=True)
(2): Dropout(p=0.1, inplace=False)
)
(dropout_layer): Identity()
(gamma2): Identity()
)
)
(norms): ModuleList(
(0): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
)
)
(3): DetrTransformerDecoderLayer(
(attentions): ModuleList(
(0): MultiheadAttention(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
)
(proj_drop): Dropout(p=0.0, inplace=False)
(dropout_layer): Dropout(p=0.1, inplace=False)
)
(1): Detr3DCrossAtten(
(dropout): Dropout(p=0.1, inplace=False)
(attention_weights): Linear(in_features=256, out_features=24, bias=True)
(output_proj): Linear(in_features=256, out_features=256, bias=True)
(position_encoder): Sequential(
(0): Linear(in_features=3, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
)
)
)
(ffns): ModuleList(
(0): FFN(
(layers): Sequential(
(0): Sequential(
(0): Linear(in_features=256, out_features=512, bias=True)
(1): ReLU(inplace=True)
(2): Dropout(p=0.1, inplace=False)
)
(1): Linear(in_features=512, out_features=256, bias=True)
(2): Dropout(p=0.1, inplace=False)
)
(dropout_layer): Identity()
(gamma2): Identity()
)
)
(norms): ModuleList(
(0): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
)
)
(4): DetrTransformerDecoderLayer(
(attentions): ModuleList(
(0): MultiheadAttention(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
)
(proj_drop): Dropout(p=0.0, inplace=False)
(dropout_layer): Dropout(p=0.1, inplace=False)
)
(1): Detr3DCrossAtten(
(dropout): Dropout(p=0.1, inplace=False)
(attention_weights): Linear(in_features=256, out_features=24, bias=True)
(output_proj): Linear(in_features=256, out_features=256, bias=True)
(position_encoder): Sequential(
(0): Linear(in_features=3, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
)
)
)
(ffns): ModuleList(
(0): FFN(
(layers): Sequential(
(0): Sequential(
(0): Linear(in_features=256, out_features=512, bias=True)
(1): ReLU(inplace=True)
(2): Dropout(p=0.1, inplace=False)
)
(1): Linear(in_features=512, out_features=256, bias=True)
(2): Dropout(p=0.1, inplace=False)
)
(dropout_layer): Identity()
(gamma2): Identity()
)
)
(norms): ModuleList(
(0): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
)
)
(5): DetrTransformerDecoderLayer(
(attentions): ModuleList(
(0): MultiheadAttention(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
)
(proj_drop): Dropout(p=0.0, inplace=False)
(dropout_layer): Dropout(p=0.1, inplace=False)
)
(1): Detr3DCrossAtten(
(dropout): Dropout(p=0.1, inplace=False)
(attention_weights): Linear(in_features=256, out_features=24, bias=True)
(output_proj): Linear(in_features=256, out_features=256, bias=True)
(position_encoder): Sequential(
(0): Linear(in_features=3, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
)
)
)
(ffns): ModuleList(
(0): FFN(
(layers): Sequential(
(0): Sequential(
(0): Linear(in_features=256, out_features=512, bias=True)
(1): ReLU(inplace=True)
(2): Dropout(p=0.1, inplace=False)
)
(1): Linear(in_features=512, out_features=256, bias=True)
(2): Dropout(p=0.1, inplace=False)
)
(dropout_layer): Identity()
(gamma2): Identity()
)
)
(norms): ModuleList(
(0): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
)
)
)
)
(reference_points): Linear(in_features=256, out_features=3, bias=True)
)
(cls_branches): ModuleList(
(0): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
(6): Linear(in_features=256, out_features=10, bias=True)
)
(1): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
(6): Linear(in_features=256, out_features=10, bias=True)
)
(2): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
(6): Linear(in_features=256, out_features=10, bias=True)
)
(3): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
(6): Linear(in_features=256, out_features=10, bias=True)
)
(4): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
(6): Linear(in_features=256, out_features=10, bias=True)
)
(5): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=256, bias=True)
(4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(5): ReLU(inplace=True)
(6): Linear(in_features=256, out_features=10, bias=True)
)
)
(reg_branches): ModuleList(
(0): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): ReLU()
(2): Linear(in_features=256, out_features=256, bias=True)
(3): ReLU()
(4): Linear(in_features=256, out_features=10, bias=True)
)
(1): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): ReLU()
(2): Linear(in_features=256, out_features=256, bias=True)
(3): ReLU()
(4): Linear(in_features=256, out_features=10, bias=True)
)
(2): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): ReLU()
(2): Linear(in_features=256, out_features=256, bias=True)
(3): ReLU()
(4): Linear(in_features=256, out_features=10, bias=True)
)
(3): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): ReLU()
(2): Linear(in_features=256, out_features=256, bias=True)
(3): ReLU()
(4): Linear(in_features=256, out_features=10, bias=True)
)
(4): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): ReLU()
(2): Linear(in_features=256, out_features=256, bias=True)
(3): ReLU()
(4): Linear(in_features=256, out_features=10, bias=True)
)
(5): Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): ReLU()
(2): Linear(in_features=256, out_features=256, bias=True)
(3): ReLU()
(4): Linear(in_features=256, out_features=10, bias=True)
)
)
(query_embedding): Embedding(900, 512)
)
(img_backbone): ResNet(
(conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
(layer1): ResLayer(
(0): Bottleneck(
(conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(1): Bottleneck(
(conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(2): Bottleneck(
(conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
)
(layer2): ResLayer(
(0): Bottleneck(
(conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(1): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(2): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(3): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
)
(layer3): ResLayer(
(0): Bottleneck(
(conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(1): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(2): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(3): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(4): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(5): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(6): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(7): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(8): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(9): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(10): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(11): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(12): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(13): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(14): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(15): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(16): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(17): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(18): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(19): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(20): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(21): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(22): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(256, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
)
(layer4): ResLayer(
(0): Bottleneck(
(conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(512, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(1): Bottleneck(
(conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(512, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(2): Bottleneck(
(conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): ModulatedDeformConv2dPack(
(conv_offset): Conv2d(512, 27, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
)
)
init_cfg=[{'type': 'Kaiming', 'layer': 'Conv2d'}, {'type': 'Constant', 'val': 1, 'layer': ['_BatchNorm', 'GroupNorm']}]
(img_neck): FPN(
(lateral_convs): ModuleList(
(0): ConvModule(
(conv): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
)
(1): ConvModule(
(conv): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
)
(2): ConvModule(
(conv): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
)
)
(fpn_convs): ModuleList(
(0): ConvModule(
(conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(1): ConvModule(
(conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(2): ConvModule(
(conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(3): ConvModule(
(conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)
)
)
init_cfg={'type': 'Xavier', 'layer': 'Conv2d', 'distribution': 'uniform'}
(grid_mask): GridMask()
)
Stepping further into the code, I found that only at this point does it enter the dataloader and start iterating over the data. I had assumed that walking the module meant eval was already running, but that was apparently still part of instantiating the model.
Iterating through this for loop, a call to something named nms_free_coder shows up, which should be the actual task-specific logic, so DETR3D inference has in fact started here. The next step is to drill into the concrete execution flow; a conceptual sketch of what the NMS-free decoder does follows below.
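From the bbox_coder entry in the config above (NMSFreeCoder with max_num=300), the decoding step presumably works along these lines; the snippet below is only a conceptual paraphrase of NMS-free decoding, not the project's actual source:

import torch

def nms_free_decode(cls_scores, bbox_preds, max_num=300, num_classes=10):
    # cls_scores: (num_query, num_classes) logits; bbox_preds: (num_query, box_dim) boxes.
    scores = cls_scores.sigmoid().view(-1)        # one score per (query, class) pair
    top_scores, top_idx = scores.topk(max_num)    # no NMS: just keep the top-k pairs
    labels = top_idx % num_classes
    boxes = bbox_preds[top_idx // num_classes]
    return boxes, top_scores, labels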
We can see the dataloader length is 81, which matches the dataset size expected earlier.
Stepping into the wrapper around this execution flow, the return statement contains a function call together with argument unpacking.
Going down func's call stack and drilling through a few levels, I found outputs = self.runner.model.test_step(data_batch), the classic output = model(input) pattern, so we are getting close.
return self.module.test_step(data) should be the key step:
From the distributed wrapper we land in mmengine's BaseModel code, where test_step is the more interesting part:
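For orientation, mmengine's BaseModel.test_step and the _run_forward it calls look roughly like this (paraphrased; see mmengine/model/base_model/base_model.py for the real code):

# Paraphrased from mmengine's BaseModel:
def test_step(self, data):
    data = self.data_preprocessor(data, False)      # stack/pad the batch and move it to the device
    return self._run_forward(data, mode='predict')  # dispatch to forward(..., mode='predict')

def _run_forward(self, data, mode):
    if isinstance(data, dict):
        return self(**data, mode=mode)
    return self(*data, mode=mode)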
This jumps to _run_forward; from the breakpoint we can see that the current mode is the inference mode and that data is a dict.
Looking at the input data format, mmdet has its own data wrappers, which I do not fully understand yet. For the image tensor in the inputs, the guess is 1 (batch) * 6 (views) * 3 (channels) * H * W.
Continuing with breakpoints, we finally move from the engine into mmdet3d's task-level code. This is a detector class, presumably a generic base class for detectors, so we are still one step away from the real core of the DETR3D code.
From the function docstring (these docstrings are really nicely written) we can see there are several modes: loss is the normal training mode, while inference takes another branch. There are three modes in total; the details are not clear yet (excerpt below, followed by a sketch of the dispatch):
41 - "tensor": Forward the whole network and return tensor or tuple of
42 tensor without any post-processing, same as a common nn.Module.
43 - "predict": Forward and return the predictions, which are fully
44 processed to a list of :obj:`Det3DDataSample`.
45 - "loss": Forward and return a dict of losses according to the given
46 inputs and data samples.
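Underneath that docstring, the base detector's forward is essentially a three-way dispatch on mode, roughly:

# Paraphrased mode dispatch in the base detector's forward():
def forward(self, inputs, data_samples=None, mode='tensor', **kwargs):
    if mode == 'loss':
        return self.loss(inputs, data_samples, **kwargs)       # training
    elif mode == 'predict':
        return self.predict(inputs, data_samples, **kwargs)    # our path during testing
    elif mode == 'tensor':
        return self._forward(inputs, data_samples, **kwargs)   # raw tensors, no post-processing
    raise RuntimeError(f'Invalid mode "{mode}"')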
Continuing with breakpoints, we finally arrive at DETR3D's inference code:
# original simple_test
def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
batch_data_samples: List[Det3DDataSample],
**kwargs) -> List[Det3DDataSample]:
"""Forward of testing.
Args:
batch_inputs_dict (dict): The model input dict which include
`imgs` keys.
- imgs (torch.Tensor): Tensor of batched multi-view images.
It has shape (B, N, C, H ,W)
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
Samples. It usually includes information such as
`gt_instance_3d`.
Returns:
list[:obj:`Det3DDataSample`]: Detection results of the
input sample. Each Det3DDataSample usually contain
'pred_instances_3d'. And the ``pred_instances_3d`` usually
contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instances, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bbox_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
contains a tensor with shape (num_instances, 9).
"""
batch_input_metas = [item.metainfo for item in batch_data_samples]
batch_input_metas = self.add_lidar2img(batch_input_metas)
img_feats = self.extract_feat(batch_inputs_dict, batch_input_metas)
outs = self.pts_bbox_head(img_feats, batch_input_metas)
results_list_3d = self.pts_bbox_head.predict_by_feat(
outs, batch_input_metas, **kwargs)
# change the bboxes' format
detsamples = self.add_pred_to_datasample(batch_data_samples,
results_list_3d)
return detsamples
Reading the docstring gives us some information:
- The earlier guess about the dimensions of imgs was correct: it is (B, N, C, H, W).
- Det3DDataSample carries a set of keys; here they are the scores, labels and bboxes, which in the autonomous-driving setting map to confidence, class label and object pose (position plus heading, size, attributes and so on). A quick inspection snippet follows below.
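A hypothetical way to poke at one returned sample in pdb (attribute names follow mmdet3d conventions; detsamples is what predict() returns above):

# Hypothetical pdb inspection of a single result.
sample = detsamples[0]
pred = sample.pred_instances_3d
print(pred.scores_3d.shape)           # (num_instances,) confidence scores
print(pred.labels_3d.shape)           # (num_instances,) class indices
print(pred.bboxes_3d.tensor.shape)    # (num_instances, 9) boxes: center, size, yaw, velocity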
Stepping through with breakpoints, we can see this is the classic detector pipeline:
- Data preprocessing
- Image feature extraction
- Task head
- Output postprocessing
We skip ahead quickly and find that after this the control flow returns to the engine side to iterate over the data samples. In other words, what we just walked through is the core algorithm code.
---------
Second debugging pass:
----------
After the first pass, I adjusted the breakpoints: this time they go directly into the DETR3D code, at two places, the DETR3D class __init__ and the predict inference method (see the breakpoint() calls in the listing below).
from typing import Dict, List, Optional
import torch
from torch import Tensor
from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector
from mmdet3d.registry import MODELS
from mmdet3d.structures import Det3DDataSample
from mmdet3d.structures.bbox_3d.utils import get_lidar2img
from .grid_mask import GridMask
@MODELS.register_module()
class DETR3D(MVXTwoStageDetector):
"""DETR3D: 3D Object Detection from Multi-view Images via 3D-to-2D Queries
Args:
data_preprocessor (dict or ConfigDict, optional): The pre-process
config of :class:`Det3DDataPreprocessor`. Defaults to None.
use_grid_mask (bool) : Data augmentation. Whether to mask out some
grids during extract_img_feat. Defaults to False.
img_backbone (dict, optional): Backbone of extracting
images feature. Defaults to None.
img_neck (dict, optional): Neck of extracting
image features. Defaults to None.
pts_bbox_head (dict, optional): Bboxes head of
detr3d. Defaults to None.
train_cfg (dict, optional): Train config of model.
Defaults to None.
test_cfg (dict, optional): Train config of model.
Defaults to None.
init_cfg (dict, optional): Initialize config of
model. Defaults to None.
"""
def __init__(self,
data_preprocessor=None,
use_grid_mask=False,
img_backbone=None,
img_neck=None,
pts_bbox_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None):
breakpoint()
super(DETR3D, self).__init__(
img_backbone=img_backbone,
img_neck=img_neck,
pts_bbox_head=pts_bbox_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
data_preprocessor=data_preprocessor)
self.grid_mask = GridMask(
True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7)
self.use_grid_mask = use_grid_mask
def extract_img_feat(self, img: Tensor,
batch_input_metas: List[dict]) -> List[Tensor]:
"""Extract features from images.
Args:
img (tensor): Batched multi-view image tensor with
shape (B, N, C, H, W).
batch_input_metas (list[dict]): Meta information of multiple inputs
in a batch.
Returns:
list[tensor]: multi-level image features.
"""
B = img.size(0)
if img is not None:
input_shape = img.shape[-2:] # bs nchw
# update real input shape of each single img
for img_meta in batch_input_metas:
img_meta.update(input_shape=input_shape)
if img.dim() == 5 and img.size(0) == 1:
img.squeeze_()
elif img.dim() == 5 and img.size(0) > 1:
B, N, C, H, W = img.size()
img = img.view(B * N, C, H, W)
if self.use_grid_mask:
img = self.grid_mask(img) # mask out some grids
img_feats = self.img_backbone(img)
if isinstance(img_feats, dict):
img_feats = list(img_feats.values())
else:
return None
if self.with_img_neck:
img_feats = self.img_neck(img_feats)
img_feats_reshaped = []
for img_feat in img_feats:
BN, C, H, W = img_feat.size()
img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W))
return img_feats_reshaped
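# (Reading note, not part of the original file: for B = 1 the squeeze_() above turns the
#  (1, 6, 3, H, W) multi-view tensor into (6, 3, H, W), so the ResNet backbone and FPN see
#  the six camera views as a batch of six ordinary images; the final view() call restores
#  the (B, N, C, H', W') layout per FPN level before the DETR3D head consumes it.)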
def extract_feat(self, batch_inputs_dict: Dict,
batch_input_metas: List[dict]) -> List[Tensor]:
"""Extract features from images.
Refer to self.extract_img_feat()
"""
imgs = batch_inputs_dict.get('imgs', None)
img_feats = self.extract_img_feat(imgs, batch_input_metas)
return img_feats
def _forward(self):
raise NotImplementedError('tensor mode is yet to add')
# original forward_train
def loss(self, batch_inputs_dict: Dict[List, Tensor],
batch_data_samples: List[Det3DDataSample],
**kwargs) -> List[Det3DDataSample]:
"""
Args:
batch_inputs_dict (dict): The model input dict which include
`imgs` keys.
- imgs (torch.Tensor): Tensor of batched multi-view images.
It has shape (B, N, C, H ,W)
batch_data_samples (List[obj:`Det3DDataSample`]): The Data Samples
It usually includes information such as `gt_instance_3d`.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
batch_input_metas = [item.metainfo for item in batch_data_samples]
batch_input_metas = self.add_lidar2img(batch_input_metas)
img_feats = self.extract_feat(batch_inputs_dict, batch_input_metas)
outs = self.pts_bbox_head(img_feats, batch_input_metas, **kwargs)
batch_gt_instances_3d = [
item.gt_instances_3d for item in batch_data_samples
]
loss_inputs = [batch_gt_instances_3d, outs]
losses_pts = self.pts_bbox_head.loss_by_feat(*loss_inputs)
return losses_pts
# original simple_test
def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
batch_data_samples: List[Det3DDataSample],
**kwargs) -> List[Det3DDataSample]:
"""Forward of testing.
Args:
batch_inputs_dict (dict): The model input dict which include
`imgs` keys.
- imgs (torch.Tensor): Tensor of batched multi-view images.
It has shape (B, N, C, H ,W)
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
Samples. It usually includes information such as
`gt_instance_3d`.
Returns:
list[:obj:`Det3DDataSample`]: Detection results of the
input sample. Each Det3DDataSample usually contain
'pred_instances_3d'. And the ``pred_instances_3d`` usually
contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instances, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bbox_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
contains a tensor with shape (num_instances, 9).
"""
breakpoint()
batch_input_metas = [item.metainfo for item in batch_data_samples]
batch_input_metas = self.add_lidar2img(batch_input_metas)
img_feats = self.extract_feat(batch_inputs_dict, batch_input_metas)
outs = self.pts_bbox_head(img_feats, batch_input_metas)
results_list_3d = self.pts_bbox_head.predict_by_feat(
outs, batch_input_metas, **kwargs)
# change the bboxes' format
detsamples = self.add_pred_to_datasample(batch_data_samples,
results_list_3d)
return detsamples
# may need speed-up
def add_lidar2img(self, batch_input_metas: List[Dict]) -> List[Dict]:
"""add 'lidar2img' transformation matrix into batch_input_metas.
Args:
batch_input_metas (list[dict]): Meta information of multiple inputs
in a batch.
Returns:
batch_input_metas (list[dict]): Meta info with lidar2img added
"""
for meta in batch_input_metas:
l2i = list()
for i in range(len(meta['cam2img'])):
c2i = torch.tensor(meta['cam2img'][i]).double()
l2c = torch.tensor(meta['lidar2cam'][i]).double()
l2i.append(get_lidar2img(c2i, l2c).float().numpy())
meta['lidar2img'] = l2i
return batch_input_metas
From: https://blog.csdn.net/Yougrianes/article/details/143134914