Repository: yichen928/SparseFusion Branch: main Commit: 22537781e033 Files: 516 Total size: 3.1 MB Directory structure: gitextract_232uyltz/ ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── README_zh-CN.md ├── configs/ │ ├── 3dssd/ │ │ ├── 3dssd_kitti-3d-car.py │ │ └── README.md │ ├── _base_/ │ │ ├── datasets/ │ │ │ ├── coco_instance.py │ │ │ ├── kitti-3d-3class.py │ │ │ ├── kitti-3d-car.py │ │ │ ├── lyft-3d.py │ │ │ ├── nuim_instance.py │ │ │ ├── nus-3d.py │ │ │ ├── range100_lyft-3d.py │ │ │ ├── scannet-3d-18class.py │ │ │ ├── sunrgbd-3d-10class.py │ │ │ ├── waymoD5-3d-3class.py │ │ │ └── waymoD5-3d-car.py │ │ ├── default_runtime.py │ │ ├── models/ │ │ │ ├── 3dssd.py │ │ │ ├── cascade_mask_rcnn_r50_fpn.py │ │ │ ├── centerpoint_01voxel_second_secfpn_nus.py │ │ │ ├── centerpoint_02pillar_second_secfpn_nus.py │ │ │ ├── h3dnet.py │ │ │ ├── hv_pointpillars_fpn_lyft.py │ │ │ ├── hv_pointpillars_fpn_nus.py │ │ │ ├── hv_pointpillars_fpn_range100_lyft.py │ │ │ ├── hv_pointpillars_secfpn_kitti.py │ │ │ ├── hv_pointpillars_secfpn_waymo.py │ │ │ ├── hv_second_secfpn_kitti.py │ │ │ ├── hv_second_secfpn_waymo.py │ │ │ ├── imvotenet_image.py │ │ │ ├── mask_rcnn_r50_fpn.py │ │ │ └── votenet.py │ │ └── schedules/ │ │ ├── cyclic_20e.py │ │ ├── cyclic_40e.py │ │ ├── mmdet_schedule_1x.py │ │ ├── schedule_2x.py │ │ └── schedule_3x.py │ ├── benchmark/ │ │ ├── hv_PartA2_secfpn_4x8_cyclic_80e_pcdet_kitti-3d-3class.py │ │ ├── hv_pointpillars_secfpn_3x8_100e_det3d_kitti-3d-car.py │ │ ├── hv_pointpillars_secfpn_4x8_80e_pcdet_kitti-3d-3class.py │ │ └── hv_second_secfpn_4x8_80e_pcdet_kitti-3d-3class.py │ ├── centerpoint/ │ │ ├── README.md │ │ ├── centerpoint_0075voxel_second_secfpn_4x8_cyclic_20e_nus.py │ │ ├── centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus.py │ │ ├── centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_20e_nus.py │ │ ├── centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_flip-tta_20e_nus.py │ │ ├── 
centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_tta_20e_nus.py │ │ ├── centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus.py │ │ ├── centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_flip-tta_20e_nus.py │ │ ├── centerpoint_01voxel_second_secfpn_4x8_cyclic_20e_nus.py │ │ ├── centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus.py │ │ ├── centerpoint_01voxel_second_secfpn_dcn_4x8_cyclic_20e_nus.py │ │ ├── centerpoint_01voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus.py │ │ ├── centerpoint_02pillar_second_secfpn_4x8_cyclic_20e_nus.py │ │ ├── centerpoint_02pillar_second_secfpn_circlenms_4x8_cyclic_20e_nus.py │ │ ├── centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus.py │ │ └── centerpoint_02pillar_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus.py │ ├── dynamic_voxelization/ │ │ ├── README.md │ │ ├── dv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py │ │ ├── dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class.py │ │ └── dv_second_secfpn_6x8_80e_kitti-3d-car.py │ ├── fp16/ │ │ ├── README.md │ │ ├── hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py │ │ ├── hv_pointpillars_regnet-400mf_fpn_sbn-all_fp16_2x8_2x_nus-3d.py │ │ ├── hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d.py │ │ ├── hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py │ │ └── hv_second_secfpn_fp16_6x8_80e_kitti-3d-car.py │ ├── free_anchor/ │ │ ├── README.md │ │ ├── hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py │ │ ├── hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py │ │ ├── hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d.py │ │ ├── hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py │ │ ├── hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d.py │ │ └── hv_pointpillars_regnet-400mf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py │ ├── h3dnet/ │ │ ├── README.md │ │ └── h3dnet_3x8_scannet-3d-18class.py │ ├── imvotenet/ │ │ ├── README.md │ │ ├── 
imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class.py │ │ └── imvotenet_stage2_16x8_sunrgbd-3d-10class.py │ ├── mvxnet/ │ │ ├── README.md │ │ └── dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class.py │ ├── nuimages/ │ │ ├── README.md │ │ ├── cascade_mask_rcnn_r101_fpn_1x_nuim.py │ │ ├── cascade_mask_rcnn_r50_fpn_1x_nuim.py │ │ ├── cascade_mask_rcnn_r50_fpn_coco-20e_1x_nuim.py │ │ ├── cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim.py │ │ ├── cascade_mask_rcnn_x101_32x4d_fpn_1x_nuim.py │ │ ├── htc_r50_fpn_1x_nuim.py │ │ ├── htc_r50_fpn_coco-20e_1x_nuim.py │ │ ├── htc_r50_fpn_coco-20e_20e_nuim.py │ │ ├── htc_without_semantic_r50_fpn_1x_nuim.py │ │ ├── htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim.py │ │ ├── mask_rcnn_r101_fpn_1x_nuim.py │ │ ├── mask_rcnn_r50_caffe_fpn_1x_nuim.py │ │ ├── mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim.py │ │ ├── mask_rcnn_r50_caffe_fpn_coco-3x_20e_nuim.py │ │ ├── mask_rcnn_r50_fpn_1x_nuim.py │ │ ├── mask_rcnn_r50_fpn_coco-2x_1x_nuim.py │ │ ├── mask_rcnn_r50_fpn_coco-2x_1x_nus-2d.py │ │ ├── mask_rcnn_swinT_coco-2x_1x_nuim.py │ │ └── mask_rcnn_x101_32x4d_fpn_1x_nuim.py │ ├── nuscenes.md │ ├── parta2/ │ │ ├── README.md │ │ ├── hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class.py │ │ └── hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car.py │ ├── pointpillars/ │ │ ├── README.md │ │ ├── hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d.py │ │ ├── hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d.py │ │ ├── hv_pointpillars_fpn_sbn-all_range100_2x8_2x_lyft-3d.py │ │ ├── hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py │ │ ├── hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py │ │ ├── hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d.py │ │ ├── hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py │ │ ├── hv_pointpillars_secfpn_sbn-all_range100_2x8_2x_lyft-3d.py │ │ ├── hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-3class.py │ │ ├── hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-car.py │ │ ├── hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py │ │ └── 
hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car.py │ ├── regnet/ │ │ ├── README.md │ │ ├── hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d.py │ │ ├── hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d.py │ │ ├── hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d.py │ │ ├── hv_pointpillars_regnet-400mf_fpn_sbn-all_range100_2x8_2x_lyft-3d.py │ │ ├── hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d.py │ │ ├── hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d.py │ │ └── hv_pointpillars_regnet-400mf_secfpn_sbn-all_range100_2x8_2x_lyft-3d.py │ ├── second/ │ │ ├── README.md │ │ ├── hv_second_secfpn_6x8_80e_kitti-3d-3class.py │ │ ├── hv_second_secfpn_6x8_80e_kitti-3d-car.py │ │ └── hv_second_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py │ ├── sparsefusion_nusc_voxel_LC_SwinT.py │ ├── sparsefusion_nusc_voxel_LC_r50.py │ ├── ssn/ │ │ ├── README.md │ │ ├── hv_ssn_regnet-400mf_secfpn_sbn-all_1x16_2x_lyft-3d.py │ │ ├── hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_nus-3d.py │ │ ├── hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d.py │ │ └── hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d.py │ ├── transfusion_nusc_pillar_L.py │ ├── transfusion_nusc_pillar_LC.py │ ├── transfusion_nusc_voxel_L.py │ ├── transfusion_nusc_voxel_LC.py │ ├── transfusion_waymo_voxel_L.py │ ├── transfusion_waymo_voxel_LC.py │ ├── votenet/ │ │ ├── README.md │ │ ├── votenet_16x8_sunrgbd-3d-10class.py │ │ ├── votenet_8x8_scannet-3d-18class.py │ │ └── votenet_iouloss_8x8_scannet-3d-18class.py │ └── waymo.md ├── demo/ │ └── pcd_demo.py ├── docker/ │ └── Dockerfile ├── mmdet3d/ │ ├── __init__.py │ ├── apis/ │ │ ├── __init__.py │ │ ├── inference.py │ │ └── test.py │ ├── core/ │ │ ├── __init__.py │ │ ├── anchor/ │ │ │ ├── __init__.py │ │ │ └── anchor_3d_generator.py │ │ ├── bbox/ │ │ │ ├── __init__.py │ │ │ ├── assigners/ │ │ │ │ ├── __init__.py │ │ │ │ └── hungarian_assigner.py │ │ │ ├── box_np_ops.py │ │ │ ├── coders/ │ │ │ │ ├── __init__.py │ │ │ │ ├── anchor_free_bbox_coder.py │ │ │ │ ├── 
camera_bbox_coder.py │ │ │ │ ├── centerpoint_bbox_coders.py │ │ │ │ ├── delta_xyzwhlr_bbox_coder.py │ │ │ │ ├── partial_bin_based_bbox_coder.py │ │ │ │ └── transfusion_bbox_coder.py │ │ │ ├── iou_calculators/ │ │ │ │ ├── __init__.py │ │ │ │ └── iou3d_calculator.py │ │ │ ├── samplers/ │ │ │ │ ├── __init__.py │ │ │ │ └── iou_neg_piecewise_sampler.py │ │ │ ├── structures/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base_box3d.py │ │ │ │ ├── box_3d_mode.py │ │ │ │ ├── cam_box3d.py │ │ │ │ ├── coord_3d_mode.py │ │ │ │ ├── depth_box3d.py │ │ │ │ ├── lidar_box3d.py │ │ │ │ └── utils.py │ │ │ └── transforms.py │ │ ├── evaluation/ │ │ │ ├── __init__.py │ │ │ ├── indoor_eval.py │ │ │ ├── kitti_utils/ │ │ │ │ ├── __init__.py │ │ │ │ ├── eval.py │ │ │ │ └── rotate_iou.py │ │ │ ├── lyft_eval.py │ │ │ ├── seg_eval.py │ │ │ └── waymo_utils/ │ │ │ └── prediction_kitti_to_waymo.py │ │ ├── points/ │ │ │ ├── __init__.py │ │ │ ├── base_points.py │ │ │ ├── cam_points.py │ │ │ ├── depth_points.py │ │ │ └── lidar_points.py │ │ ├── post_processing/ │ │ │ ├── __init__.py │ │ │ ├── box3d_nms.py │ │ │ └── merge_augs.py │ │ ├── utils/ │ │ │ ├── __init__.py │ │ │ └── gaussian.py │ │ ├── visualizer/ │ │ │ ├── __init__.py │ │ │ ├── open3d_vis.py │ │ │ └── show_result.py │ │ └── voxel/ │ │ ├── __init__.py │ │ ├── builder.py │ │ └── voxel_generator.py │ ├── datasets/ │ │ ├── __init__.py │ │ ├── builder.py │ │ ├── custom_3d.py │ │ ├── dataset_wrappers.py │ │ ├── kitti2d_dataset.py │ │ ├── kitti_dataset.py │ │ ├── lyft_dataset.py │ │ ├── nuscenes_dataset.py │ │ ├── nuscenes_dataset_viewInfo.py │ │ ├── pipelines/ │ │ │ ├── __init__.py │ │ │ ├── data_augment_utils.py │ │ │ ├── dbsampler.py │ │ │ ├── formating.py │ │ │ ├── loading.py │ │ │ ├── test_time_aug.py │ │ │ ├── transforms_2d.py │ │ │ └── transforms_3d.py │ │ ├── registry.py │ │ ├── scannet_dataset.py │ │ ├── semantickitti_dataset.py │ │ ├── sunrgbd_dataset.py │ │ └── waymo_dataset.py │ ├── models/ │ │ ├── __init__.py │ │ ├── backbones/ │ │ │ ├── 
DLA.py │ │ │ ├── __init__.py │ │ │ ├── base_pointnet.py │ │ │ ├── multi_backbone.py │ │ │ ├── nostem_regnet.py │ │ │ ├── pointnet2_sa_msg.py │ │ │ ├── pointnet2_sa_ssg.py │ │ │ ├── second.py │ │ │ └── swin.py │ │ ├── builder.py │ │ ├── dense_heads/ │ │ │ ├── __init__.py │ │ │ ├── anchor3d_head.py │ │ │ ├── base_conv_bbox_head.py │ │ │ ├── centerpoint_head.py │ │ │ ├── free_anchor3d_head.py │ │ │ ├── parta2_rpn_head.py │ │ │ ├── shape_aware_head.py │ │ │ ├── sparsefusion_head_deform.py │ │ │ ├── ssd_3d_head.py │ │ │ ├── train_mixins.py │ │ │ ├── transfusion_head.py │ │ │ └── vote_head.py │ │ ├── detectors/ │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── centerpoint.py │ │ │ ├── dynamic_voxelnet.py │ │ │ ├── h3dnet.py │ │ │ ├── imvotenet.py │ │ │ ├── mvx_faster_rcnn.py │ │ │ ├── mvx_two_stage.py │ │ │ ├── parta2.py │ │ │ ├── single_stage.py │ │ │ ├── sparsefusion.py │ │ │ ├── ssd3dnet.py │ │ │ ├── transfusion.py │ │ │ ├── two_stage.py │ │ │ ├── votenet.py │ │ │ └── voxelnet.py │ │ ├── fusion_layers/ │ │ │ ├── __init__.py │ │ │ ├── coord_transform.py │ │ │ ├── point_fusion.py │ │ │ └── vote_fusion.py │ │ ├── losses/ │ │ │ ├── __init__.py │ │ │ ├── axis_aligned_iou_loss.py │ │ │ ├── chamfer_distance.py │ │ │ └── uncertainty_loss.py │ │ ├── middle_encoders/ │ │ │ ├── __init__.py │ │ │ ├── pillar_scatter.py │ │ │ ├── sparse_encoder.py │ │ │ └── sparse_unet.py │ │ ├── model_utils/ │ │ │ ├── __init__.py │ │ │ └── vote_module.py │ │ ├── necks/ │ │ │ ├── __init__.py │ │ │ └── second_fpn.py │ │ ├── registry.py │ │ ├── roi_heads/ │ │ │ ├── __init__.py │ │ │ ├── base_3droi_head.py │ │ │ ├── bbox_heads/ │ │ │ │ ├── __init__.py │ │ │ │ ├── h3d_bbox_head.py │ │ │ │ └── parta2_bbox_head.py │ │ │ ├── h3d_roi_head.py │ │ │ ├── mask_heads/ │ │ │ │ ├── __init__.py │ │ │ │ ├── pointwise_semantic_head.py │ │ │ │ └── primitive_head.py │ │ │ ├── part_aggregation_roi_head.py │ │ │ └── roi_extractors/ │ │ │ ├── __init__.py │ │ │ └── single_roiaware_extractor.py │ │ ├── utils/ │ │ │ ├── 
__init__.py │ │ │ ├── clip_sigmoid.py │ │ │ ├── deformable_decoder.py │ │ │ ├── depth_encoder.py │ │ │ ├── drop.py │ │ │ ├── ffn.py │ │ │ ├── inverse_sigmoid.py │ │ │ ├── mlp.py │ │ │ ├── network_modules.py │ │ │ ├── ops/ │ │ │ │ ├── functions/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── ms_deform_attn_func.py │ │ │ │ ├── make.sh │ │ │ │ ├── modules/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── ms_deform_attn.py │ │ │ │ ├── setup.py │ │ │ │ ├── src/ │ │ │ │ │ ├── cpu/ │ │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ │ │ ├── cuda/ │ │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ │ ├── ms_deform_attn.h │ │ │ │ │ └── vision.cpp │ │ │ │ └── test.py │ │ │ ├── projection.py │ │ │ ├── sparsefusion_models.py │ │ │ ├── transformer.py │ │ │ └── transformerdecoder.py │ │ └── voxel_encoders/ │ │ ├── __init__.py │ │ ├── pillar_encoder.py │ │ ├── utils.py │ │ └── voxel_encoder.py │ ├── ops/ │ │ ├── __init__.py │ │ ├── ball_query/ │ │ │ ├── __init__.py │ │ │ ├── ball_query.py │ │ │ └── src/ │ │ │ ├── ball_query.cpp │ │ │ └── ball_query_cuda.cu │ │ ├── furthest_point_sample/ │ │ │ ├── __init__.py │ │ │ ├── furthest_point_sample.py │ │ │ ├── points_sampler.py │ │ │ ├── src/ │ │ │ │ ├── furthest_point_sample.cpp │ │ │ │ └── furthest_point_sample_cuda.cu │ │ │ └── utils.py │ │ ├── gather_points/ │ │ │ ├── __init__.py │ │ │ ├── gather_points.py │ │ │ └── src/ │ │ │ ├── gather_points.cpp │ │ │ └── gather_points_cuda.cu │ │ ├── group_points/ │ │ │ ├── __init__.py │ │ │ ├── group_points.py │ │ │ └── src/ │ │ │ ├── group_points.cpp │ │ │ └── group_points_cuda.cu │ │ ├── interpolate/ │ │ │ ├── __init__.py │ │ │ ├── src/ │ │ │ │ ├── interpolate.cpp │ │ │ │ ├── three_interpolate_cuda.cu │ │ │ │ └── three_nn_cuda.cu │ │ │ ├── three_interpolate.py │ │ │ └── three_nn.py │ │ ├── iou3d/ │ │ │ ├── __init__.py │ │ │ ├── iou3d_utils.py │ │ │ └── src/ │ │ │ ├── iou3d.cpp │ │ │ └── iou3d_kernel.cu │ │ 
├── knn/ │ │ │ ├── __init__.py │ │ │ ├── knn.py │ │ │ └── src/ │ │ │ ├── knn.cpp │ │ │ └── knn_cuda.cu │ │ ├── norm.py │ │ ├── pointnet_modules/ │ │ │ ├── __init__.py │ │ │ ├── builder.py │ │ │ ├── point_fp_module.py │ │ │ ├── point_sa_module.py │ │ │ └── registry.py │ │ ├── roiaware_pool3d/ │ │ │ ├── __init__.py │ │ │ ├── points_in_boxes.py │ │ │ ├── roiaware_pool3d.py │ │ │ └── src/ │ │ │ ├── points_in_boxes_cpu.cpp │ │ │ ├── points_in_boxes_cuda.cu │ │ │ ├── roiaware_pool3d.cpp │ │ │ └── roiaware_pool3d_kernel.cu │ │ ├── sparse_block.py │ │ ├── spconv/ │ │ │ ├── __init__.py │ │ │ ├── conv.py │ │ │ ├── functional.py │ │ │ ├── include/ │ │ │ │ ├── paramsgrid.h │ │ │ │ ├── prettyprint.h │ │ │ │ ├── pybind11_utils.h │ │ │ │ ├── spconv/ │ │ │ │ │ ├── fused_spconv_ops.h │ │ │ │ │ ├── geometry.h │ │ │ │ │ ├── indice.cu.h │ │ │ │ │ ├── indice.h │ │ │ │ │ ├── maxpool.h │ │ │ │ │ ├── mp_helper.h │ │ │ │ │ ├── point2voxel.h │ │ │ │ │ ├── pool_ops.h │ │ │ │ │ ├── reordering.cu.h │ │ │ │ │ ├── reordering.h │ │ │ │ │ └── spconv_ops.h │ │ │ │ ├── tensorview/ │ │ │ │ │ ├── helper_kernel.cu.h │ │ │ │ │ ├── helper_launch.h │ │ │ │ │ └── tensorview.h │ │ │ │ ├── torch_utils.h │ │ │ │ └── utility/ │ │ │ │ └── timer.h │ │ │ ├── modules.py │ │ │ ├── ops.py │ │ │ ├── overwrite_spconv/ │ │ │ │ └── write_spconv2.py │ │ │ ├── pool.py │ │ │ ├── src/ │ │ │ │ ├── all.cc │ │ │ │ ├── indice.cc │ │ │ │ ├── indice_cuda.cu │ │ │ │ ├── maxpool.cc │ │ │ │ ├── maxpool_cuda.cu │ │ │ │ ├── reordering.cc │ │ │ │ └── reordering_cuda.cu │ │ │ ├── structure.py │ │ │ └── test_utils.py │ │ └── voxel/ │ │ ├── __init__.py │ │ ├── scatter_points.py │ │ ├── src/ │ │ │ ├── scatter_points_cpu.cpp │ │ │ ├── scatter_points_cuda.cu │ │ │ ├── voxelization.cpp │ │ │ ├── voxelization.h │ │ │ ├── voxelization_cpu.cpp │ │ │ └── voxelization_cuda.cu │ │ └── voxelize.py │ ├── utils/ │ │ ├── __init__.py │ │ └── collect_env.py │ └── version.py ├── requirements/ │ ├── build.txt │ ├── docs.txt │ ├── optional.txt │ ├── 
readthedocs.txt │ ├── runtime.txt │ └── tests.txt ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests/ │ ├── test_data/ │ │ ├── test_datasets/ │ │ │ ├── test_dataset_wrappers.py │ │ │ ├── test_kitti_dataset.py │ │ │ ├── test_lyft_dataset.py │ │ │ ├── test_nuscene_dataset.py │ │ │ ├── test_scannet_dataset.py │ │ │ ├── test_semantickitti_dataset.py │ │ │ └── test_sunrgbd_dataset.py │ │ └── test_pipelines/ │ │ ├── test_augmentations/ │ │ │ ├── test_data_augment_utils.py │ │ │ ├── test_test_augment_utils.py │ │ │ └── test_transforms_3d.py │ │ ├── test_indoor_pipeline.py │ │ ├── test_indoor_sample.py │ │ ├── test_loadings/ │ │ │ ├── test_load_points_from_multi_sweeps.py │ │ │ └── test_loading.py │ │ └── test_outdoor_pipeline.py │ ├── test_metrics/ │ │ ├── test_indoor_eval.py │ │ ├── test_kitti_eval.py │ │ ├── test_losses.py │ │ └── test_seg_eval.py │ ├── test_models/ │ │ ├── test_backbones.py │ │ ├── test_common_modules/ │ │ │ ├── test_middle_encoders.py │ │ │ ├── test_pointnet_modules.py │ │ │ ├── test_pointnet_ops.py │ │ │ ├── test_roiaware_pool3d.py │ │ │ ├── test_sparse_unet.py │ │ │ └── test_vote_module.py │ │ ├── test_detectors.py │ │ ├── test_forward.py │ │ ├── test_fusion/ │ │ │ ├── test_fusion_coord_trans.py │ │ │ ├── test_point_fusion.py │ │ │ └── test_vote_fusion.py │ │ ├── test_heads/ │ │ │ ├── test_heads.py │ │ │ ├── test_parta2_bbox_head.py │ │ │ ├── test_roi_extractors.py │ │ │ └── test_semantic_heads.py │ │ ├── test_necks/ │ │ │ ├── test_fpn.py │ │ │ └── test_necks.py │ │ └── test_voxel_encoder/ │ │ ├── test_dynamic_scatter.py │ │ ├── test_voxel_encoders.py │ │ ├── test_voxel_generator.py │ │ └── test_voxelize.py │ ├── test_runtime/ │ │ ├── test_apis.py │ │ └── test_config.py │ ├── test_samples/ │ │ └── parta2_roihead_inputs.npz │ └── test_utils/ │ ├── test_anchors.py │ ├── test_assigners.py │ ├── test_bbox_coders.py │ ├── test_box3d.py │ ├── test_box_np_ops.py │ ├── test_coord_3d_mode.py │ ├── test_merge_augs.py │ ├── test_nms.py │ ├── 
test_points.py │ ├── test_samplers.py │ └── test_utils.py ├── tools/ │ ├── analysis_tools/ │ │ ├── analyze_logs.py │ │ ├── benchmark.py │ │ └── get_flops.py │ ├── combine_view_info.py │ ├── create_data.py │ ├── create_data.sh │ ├── data_converter/ │ │ ├── __init__.py │ │ ├── create_gt_database.py │ │ ├── indoor_converter.py │ │ ├── kitti_converter.py │ │ ├── kitti_data_utils.py │ │ ├── lyft_converter.py │ │ ├── nuimage_converter.py │ │ ├── nuscenes_converter.py │ │ ├── scannet_data_utils.py │ │ ├── sunrgbd_data_utils.py │ │ └── waymo_converter.py │ ├── dist_test.sh │ ├── dist_train.sh │ ├── misc/ │ │ ├── fuse_conv_bn.py │ │ ├── print_config.py │ │ └── visualize_results.py │ ├── model_converters/ │ │ ├── convert_votenet_checkpoints.py │ │ ├── publish_model.py │ │ └── regnet2mmdet.py │ ├── slurm_test.sh │ ├── slurm_train.sh │ ├── test.py │ └── train.py └── train.sh ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class *.ipynb # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ # cython generated cpp data .vscode .idea # custom *.pkl *.pkl.json *.log.json work_dirs/ exps/ *~ # Pytorch *.pth # demo *.jpg *.png /data/scannet/scans/ /data/sunrgbd/OFFICIAL_SUNRGBD/ *.obj *.ply ================================================ FILE: LICENSE ================================================ Copyright 2018-2019 Open-MMLab. All rights reserved. Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2018-2019 Open-MMLab. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: MANIFEST.in ================================================ include requirements/*.txt include mmdet3d/ops/**/*.cpp mmdet3d/ops/**/*.cu include mmdet3d/ops/**/*.h mmdet3d/ops/**/*.cc include mmdet3d/VERSION ================================================ FILE: README.md ================================================ # [ICCV 2023] SparseFusion: Fusing Multi-Modal Sparse Representations for Multi-Sensor 3D Object Detection ![video](video.gif) ## Abstract We propose SparseFusion, a novel multi-sensor 3D detection method that exclusively uses sparse candidates and sparse representations. Specifically, SparseFusion utilizes the outputs of parallel detectors in the LiDAR and camera modalities as sparse candidates for fusion. We transform the camera candidates into the LiDAR coordinate space by disentangling the object representations. Then, we can fuse the multi-modality candidates in a unified 3D space by a lightweight self-attention module. To mitigate negative transfer between modalities, we propose novel semantic and geometric cross-modality transfer modules that are applied prior to the modality-specific detectors. SparseFusion achieves state-of-the-art performance on the nuScenes benchmark while also running at the fastest speed. 
[[paper link]](https://openaccess.thecvf.com/content/ICCV2023/papers/Xie_SparseFusion_Fusing_Multi-Modal_Sparse_Representations_for_Multi-Sensor_3D_Object_Detection_ICCV_2023_paper.pdf) [[Chinese summary (自动驾驶之心)]](https://zhuanlan.zhihu.com/p/671293323) ## Updates [2023-8-21] Much better training GPU memory efficiency (45GB -> 29GB) without hurting performance or speed! [2023-7-13] 🔥SparseFusion has been accepted to ICCV 2023!🔥 [2023-3-21] We release the first version of the SparseFusion code. ## Overview ![teaser](teaser.png) Compared to existing fusion algorithms, SparseFusion achieves state-of-the-art performance as well as the fastest inference speed on the nuScenes test set. †: The official [repository](https://github.com/zehuichen123/AutoAlignV2) of AutoAlignV2 uses flip as test-time augmentation. ‡: We use BEVFusion-base results in the official [repository](https://github.com/mit-han-lab/bevfusion) of BEVFusion to match the input resolutions of other methods. $\S:$ Swin-T is adopted as the image backbone. ## nuScenes Performance We do not use any test-time augmentations or model ensembles to get these results. We have released the configuration files and pretrained checkpoints to reproduce our results.
#### Validation Set | Image Backbone | Point Cloud Backbone | mAP | NDS | Link | | --------- | ------ | ------ | --------- | --------- | | ResNet50 | VoxelNet | 70.5 | 72.8 | [config](configs/sparsefusion_nusc_voxel_LC_r50.py)/[ckpt](https://drive.google.com/file/d/1NZIrg7s-VwxkwuPHTTWSQQO7T7IILBGC/view?usp=share_link) | | Swin-T | VoxelNet | 71.0 | 73.1 | [config](configs/sparsefusion_nusc_voxel_LC_SwinT.py)/[ckpt](https://drive.google.com/file/d/1dAhOKtbLd1e3I5jwk_3E1gzbl61P24qy/view?usp=share_link) | #### Test Set | Image Backbone | Point Cloud Backbone | mAP | NDS | | --------- | ------ | ------ | --------- | | ResNet50 | VoxelNet | 72.0 | 73.8 | ## Usage #### Installation + We tested our code in an environment with CUDA 11.5, Python 3.7, PyTorch 1.7.1, TorchVision 0.8.2, NumPy 1.20.0, and numba 0.48.0. + We use `mmdet==2.10.0, mmcv==1.2.7` for our code. Please refer to their official instructions for installation. + You can install `mmdet3d==0.11.0` directly from our repo by ``` cd SparseFusion pip install -e . ``` + We use `spconv==2.3.3`. Please follow the [official instructions](https://github.com/traveller59/spconv) to install it based on your CUDA version. ``` pip install spconv-cuxxx # e.g. pip install spconv-cu114 ``` + You also need to install the deformable attention module with the following command. ``` pip install ./mmdet3d/models/utils/ops ``` #### Data Preparation Download the full nuScenes dataset from the [official website](https://www.nuscenes.org/download). You should have a folder structure like this: ``` SparseFusion ├── mmdet3d ├── tools ├── configs ├── data │ ├── nuscenes │ │ ├── maps │ │ ├── samples │ │ ├── sweeps │ │ ├── v1.0-test │ │ ├── v1.0-trainval ``` Then, you can choose **either** of the following two ways to preprocess the data. 1. Run the following two commands sequentially. ``` python tools/create_data.py nuscenes --root-path ./data/nuscenes --out-dir ./data/nuscenes --extra-tag nuscenes python tools/combine_view_info.py ``` 2.
Alternatively, you may directly download our preprocessed data from [Google Drive](https://drive.google.com/drive/folders/1L5lvLsNWBA0vfTlNSMa4OXXBLoZgJbg4?usp=share_link), and put these files in `data/nuscenes`. #### Initial Weights Please download the [initial weights](https://drive.google.com/drive/folders/1wmYBi3PBprdcegF843AU-22q2OwDgoZk?usp=share_link) for model training, and put them in `checkpoints/`. #### Train & Test In our default setting, we train the model with 4 GPUs. ``` # training bash tools/dist_train.sh configs/sparsefusion_nusc_voxel_LC_r50.py 4 --work-dir work_dirs/sparsefusion_nusc_voxel_LC_r50 # test bash tools/dist_test.sh configs/sparsefusion_nusc_voxel_LC_r50.py ${CHECKPOINT_FILE} 4 --eval=bbox ``` Note: We use A6000 GPUs (48 GB of memory per GPU) for model training. Training the SparseFusion model (ResNet50 backbone) requires ~29 GB of memory per GPU. ## Contact If you have any questions, feel free to open an issue or contact us at yichen_xie@berkeley.edu. ## Acknowledgments We sincerely thank the authors of [mmdetection3d](https://github.com/open-mmlab/mmdetection3d), [TransFusion](https://github.com/XuyangBai/TransFusion), [BEVFusion](https://github.com/mit-han-lab/bevfusion), [MSMDFusion](https://github.com/SxJyJay/MSMDFusion), and [DeepInteraction](https://github.com/fudan-zvg/DeepInteraction) for providing their code or pretrained weights. ## Reference If you find our work useful, please consider citing the following paper: ``` @article{xie2023sparsefusion, title={SparseFusion: Fusing Multi-Modal Sparse Representations for Multi-Sensor 3D Object Detection}, author={Xie, Yichen and Xu, Chenfeng and Rakotosaona, Marie-Julie and Rim, Patrick and Tombari, Federico and Keutzer, Kurt and Tomizuka, Masayoshi and Zhan, Wei}, journal={arXiv preprint arXiv:2304.14340}, year={2023} } ``` ================================================ FILE: README_zh-CN.md ================================================
[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmdetection3d.readthedocs.io/en/latest/) [![badge](https://github.com/open-mmlab/mmdetection3d/workflows/build/badge.svg)](https://github.com/open-mmlab/mmdetection3d/actions) [![codecov](https://codecov.io/gh/open-mmlab/mmdetection3d/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmdetection3d) [![license](https://img.shields.io/github/license/open-mmlab/mmdetection3d.svg)](https://github.com/open-mmlab/mmdetection3d/blob/master/LICENSE) **新闻**: 我们发布了版本v0.11.0. 在第三届[ nuScenes 3D 检测挑战赛](https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Any)(第五届 AI Driving Olympics, NeurIPS 2020)中,我们获得了最佳 PKL 奖、第三名和最好的纯视觉的结果,相关的代码和模型将会在不久后发布。 文档: https://mmdetection3d.readthedocs.io/ ## 简介 [English](README.md) | 简体中文 主分支代码目前支持 PyTorch 1.3 以上的版本。 MMDetection3D 是一个基于 PyTorch 的目标检测开源工具箱, 下一代面向3D检测的平台. 它是 OpenMMlab 项目的一部分,这个项目由香港中文大学多媒体实验室和商汤科技联合发起. ![demo image](resources/mmdet3d_outdoor_demo.gif) ### 主要特性 - **支持多模态/单模态的检测器** 支持多模态/单模态检测器,包括 MVXNet,VoteNet,PointPillars 等。 - **支持户内/户外的数据集** 支持室内/室外的3D检测数据集,包括 ScanNet, SUNRGB-D, Waymo, nuScenes, Lyft, KITTI. 对于 nuScenes 数据集, 我们也支持 [nuImages 数据集](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/nuimages). 
- **与 2D 检测器的自然整合** [MMDetection](https://github.com/open-mmlab/mmdetection/blob/master/docs/model_zoo.md) 支持的**300+个模型 , 40+的论文算法**, 和相关模块都可以在此代码库中训练或使用。 - **性能高** 训练速度比其他代码库更快。下表可见主要的对比结果。更多的细节可见[基准测评文档](./docs/benchmarks.md)。我们对比了每秒训练的样本数(值越高越好)。其他代码库不支持的模型被标记为 `×`。 | Methods | MMDetection3D | [OpenPCDet](https://github.com/open-mmlab/OpenPCDet) |[votenet](https://github.com/facebookresearch/votenet)| [Det3D](https://github.com/poodarchu/Det3D) | |:-------:|:-------------:|:---------:|:-----:|:-----:| | VoteNet | 358 | × | 77 | × | | PointPillars-car| 141 | × | × | 140 | | PointPillars-3class| 107 |44 | × | × | | SECOND| 40 |30 | × | × | | Part-A2| 17 |14 | × | × | 和 [MMDetection](https://github.com/open-mmlab/mmdetection),[MMCV](https://github.com/open-mmlab/mmcv) 一样, MMDetection3D 也可以作为一个库去支持各式各样的项目. ## 开源许可证 该项目采用 [Apache 2.0 开源许可证](LICENSE)。 ## 更新日志 最新的版本 v0.11.0 在 2021.03.01发布。 如果想了解更多版本更新细节和历史信息,请阅读[更新日志](docs/changelog.md)。 ## 基准测试和模型库 测试结果和模型可以在[模型库](docs/model_zoo.md)中找到。 已支持的骨干网络: - [x] PointNet (CVPR'2017) - [x] PointNet++ (NeurIPS'2017) - [x] RegNet (CVPR'2020) 已支持的算法: - [x] [SECOND (Sensor'2018)](configs/second/README.md) - [x] [PointPillars (CVPR'2019)](configs/pointpillars/README.md) - [x] [FreeAnchor (NeurIPS'2019)](configs/free_anchor/README.md) - [x] [VoteNet (ICCV'2019)](configs/votenet/README.md) - [x] [H3DNet (ECCV'2020)](configs/h3dnet/README.md) - [x] [3DSSD (CVPR'2020)](configs/3dssd/README.md) - [x] [Part-A2 (TPAMI'2020)](configs/parta2/README.md) - [x] [MVXNet (ICRA'2019)](configs/mvxnet/README.md) - [x] [CenterPoint (CVPR'2021)](configs/centerpoint/README.md) - [x] [SSN (ECCV'2020)](configs/ssn/README.md) - [x] [ImVoteNet (CVPR'2020)](configs/imvotenet/README.md) | | ResNet | ResNeXt | SENet |PointNet++ | HRNet | RegNetX | Res2Net | |--------------------|:--------:|:--------:|:--------:|:---------:|:-----:|:--------:|:-----:| | SECOND | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ | | PointPillars | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ | | FreeAnchor | ☐ | ☐ | ☐ | 
✗ | ☐ | ✓ | ☐ | | VoteNet | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | | H3DNet | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | | 3DSSD | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | | Part-A2 | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ | | MVXNet | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ | | CenterPoint | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ | | SSN | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ | | ImVoteNet | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | 其他特性 - [x] [Dynamic Voxelization](configs/dynamic_voxelization/README.md) **注意:** [MMDetection](https://github.com/open-mmlab/mmdetection/blob/master/docs/model_zoo.md) 支持的基于2D检测的**300+个模型 , 40+的论文算法**在 MMDetection3D 中都可以被训练或使用。 ## 安装 请参考[快速入门文档](docs/get_started.md)进行安装。 ## 快速入门 请参考[快速入门文档](docs/get_started.md)学习 MMDetection3D 的基本使用。 我们为新手提供了分别针对[已有数据集](docs/1_exist_data_model.md)和[新数据集](docs/2_new_data_model.md)的使用指南。我们也提供了一些进阶教程,内容覆盖了[学习配置文件](docs/tutorials/config.md), [增加数据集支持](docs/tutorials/customize_dataset.md), [设计新的数据预处理流程](docs/tutorials/data_pipeline.md), [增加自定义模型](docs/tutorials/customize_models.md), [增加自定义的运行时配置](docs/tutorials/customize_runtime.md)和 [Waymo 数据集](docs/tutorials/waymo.md). 
## 引用 如果你觉得本项目对你的研究工作有所帮助,请参考如下 bibtex 引用 MMdetection3D ```latex @misc{mmdet3d2020, title={{MMDetection3D: OpenMMLab} next-generation platform for general {3D} object detection}, author={MMDetection3D Contributors}, howpublished = {\url{https://github.com/open-mmlab/mmdetection3d}}, year={2020} } ``` ## 贡献指南 我们感谢所有的贡献者为改进和提升 MMDetection3D 所作出的努力。请参考[贡献指南](.github/CONTRIBUTING.md)来了解参与项目贡献的相关指引。 ## 致谢 MMDetection3D 是一款由来自不同高校和企业的研发人员共同参与贡献的开源项目。我们感谢所有为项目提供算法复现和新功能支持的贡献者,以及提供宝贵反馈的用户。我们希望这个工具箱和基准测试可以为社区提供灵活的代码工具,供用户复现已有算法并开发自己的新的 3D 检测模型。 ## OpenMMLab 的其他项目 - [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab 计算机视觉基础库 - [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱 - [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱 - [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台 - [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱 - [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱 - [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台 - [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱 - [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱 ================================================ FILE: configs/3dssd/3dssd_kitti-3d-car.py ================================================ _base_ = [ '../_base_/models/3dssd.py', '../_base_/datasets/kitti-3d-car.py', '../_base_/default_runtime.py' ] # dataset settings dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Car'] point_cloud_range = [0, -40, -5, 70, 40, 3] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'kitti_dbinfos_train.pkl', rate=1.0, prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), classes=class_names, sample_groups=dict(Car=15)) file_client_args = dict(backend='disk') # Uncomment the 
following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. # file_client_args = dict( # backend='petrel', path_mapping=dict(data='s3://kitti_data/')) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectSample', db_sampler=db_sampler), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict( type='ObjectNoise', num_try=100, translation_std=[1.0, 1.0, 0], global_rot_range=[0.0, 0.0], rot_range=[-1.0471975511965976, 1.0471975511965976]), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.9, 1.1]), dict(type='BackgroundPointsFilter', bbox_enlarge_range=(0.5, 2.0, 0.5)), dict(type='IndoorPointSample', num_points=16384), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='IndoorPointSample', num_points=16384), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( samples_per_gpu=4, workers_per_gpu=4, train=dict(dataset=dict(pipeline=train_pipeline)), val=dict(pipeline=test_pipeline), 
test=dict(pipeline=test_pipeline)) evaluation = dict(interval=2) # model settings model = dict( bbox_head=dict( num_classes=1, bbox_coder=dict( type='AnchorFreeBBoxCoder', num_dir_bins=12, with_rot=True))) # optimizer lr = 0.002 # max learning rate optimizer = dict(type='AdamW', lr=lr, weight_decay=0) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict(policy='step', warmup=None, step=[80, 120]) # runtime settings total_epochs = 150 # yapf:disable log_config = dict( interval=30, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) # yapf:enable ================================================ FILE: configs/3dssd/README.md ================================================ # 3DSSD: Point-based 3D Single Stage Object Detector ## Introduction [ALGORITHM] We implement 3DSSD and provide the results and checkpoints on KITTI datasets. ``` @inproceedings{yang20203dssd, author = {Zetong Yang and Yanan Sun and Shu Liu and Jiaya Jia}, title = {3DSSD: Point-based 3D Single Stage Object Detector}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, year = {2020} } ``` ### Experiment details on KITTI datasets Some settings in our implementation are different from the [official implementation](https://github.com/Jia-Research-Lab/3DSSD), which bring marginal differences to the performance on KITTI datasets in our experiments. To simplify and unify the models of our implementation, we skip them in our models. These differences are listed as below: 1. We keep the scenes without any object while the official code skips these scenes in training. In the official implementation, only 3229 and 3394 samples are used as training and validation sets, respectively. In our implementation, we keep using 3712 and 3769 samples as training and validation sets, respectively, as those used for all the other models in our implementation on KITTI datasets. 2. 
We do not modify the decay of `batch normalization` during training. 3. While using [`DataBaseSampler`](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/pipelines/dbsampler.py#L80) for data augmentation, the official code uses road planes as a reference to place the sampled objects while we do not. 4. We perform detection using LIDAR coordinates while the official code uses camera coordinates. ## Results ### KITTI | Backbone |Class| Lr schd | Mem (GB) | Inf time (fps) | mAP |Download | | :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | | [PointNet2SAMSG](./3dssd_kitti-3d-car.py)| Car |72e|4.7||78.39(81.00)<sup>1</sup>|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/3dssd/3dssd_kitti-3d-car_20210324_122002-07e9a19b.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/3dssd/3dssd_kitti-3d-car_20210324_122002.log.json)| [1]: We report two different 3D object detection results here. 78.39 mAP is evaluated by our evaluation code and 81.00 mAP is evaluated by the official development kit (the same as that used in the paper and the official code of 3DSSD). We found that the commonly used Python implementation of [`rotate_iou`](https://github.com/traveller59/second.pytorch/blob/e42e4a0e17262ab7d180ee96a0a36427f2c20a44/second/core/non_max_suppression/nms_gpu.py#L605), which is used in our KITTI dataset evaluation, differs from the official implementation in the [KITTI benchmark](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d).
================================================ FILE: configs/_base_/datasets/coco_instance.py ================================================ dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True, with_mask=True), dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1333, 800), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', pipeline=train_pipeline), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', pipeline=test_pipeline), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', pipeline=test_pipeline)) evaluation = dict(metric=['bbox', 'segm']) ================================================ FILE: configs/_base_/datasets/kitti-3d-3class.py ================================================ # dataset settings dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Pedestrian', 'Cyclist', 'Car'] point_cloud_range = [0, -40, -3, 70.4, 40, 1] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( 
data_root=data_root, info_path=data_root + 'kitti_dbinfos_train.pkl', rate=1.0, prepare=dict( filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), classes=class_names, sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6)) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. # file_client_args = dict( # backend='petrel', path_mapping=dict(data='s3://kitti_data/')) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='ObjectNoise', num_try=100, translation_std=[1.0, 1.0, 0.5], global_rot_range=[0.0, 0.0], rot_range=[-0.78539816, 0.78539816]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data 
= dict( samples_per_gpu=6, workers_per_gpu=4, train=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_train.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR')), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR')) evaluation = dict(interval=1) ================================================ FILE: configs/_base_/datasets/kitti-3d-car.py ================================================ # dataset settings dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Car'] point_cloud_range = [0, -40, -3, 70.4, 40, 1] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'kitti_dbinfos_train.pkl', rate=1.0, prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), classes=class_names, sample_groups=dict(Car=15)) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. 
# file_client_args = dict( # backend='petrel', path_mapping=dict(data='s3://kitti_data/')) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='ObjectNoise', num_try=100, translation_std=[1.0, 1.0, 0.5], global_rot_range=[0.0, 0.0], rot_range=[-0.78539816, 0.78539816]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( samples_per_gpu=6, workers_per_gpu=4, train=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_train.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
box_type_3d='LiDAR')), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR')) evaluation = dict(interval=1) ================================================ FILE: configs/_base_/datasets/lyft-3d.py ================================================ # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-80, -80, -5, 80, 80, 3] # For Lyft we usually do 9-class detection class_names = [ 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', 'bicycle', 'pedestrian', 'animal' ] dataset_type = 'LyftDataset' data_root = 'data/lyft/' # Input modality for Lyft dataset, this is consistent with the submission # format which requires the information in input_modality. input_modality = dict( use_lidar=True, use_camera=False, use_radar=False, use_map=False, use_external=False) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. 
# file_client_args = dict( # backend='petrel', # path_mapping=dict({ # './data/lyft/': 's3://lyft/lyft/', # 'data/lyft/': 's3://lyft/lyft/' # })) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_train.pkl', pipeline=train_pipeline, classes=class_names, modality=input_modality, test_mode=False), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_val.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True), test=dict( 
type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_test.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True)) # For Lyft dataset, we usually evaluate the model at the end of training. # Since the models are trained by 24 epochs by default, we set evaluation # interval to be 24. Please change the interval accordingly if you do not # use a default schedule. evaluation = dict(interval=24) ================================================ FILE: configs/_base_/datasets/nuim_instance.py ================================================ dataset_type = 'CocoDataset' data_root = 'data/nuimages/' class_names = [ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' ] img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True, with_mask=True), dict( type='Resize', img_scale=[(1280, 720), (1920, 1080)], multiscale_mode='range', keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1600, 900), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/nuimages_v1.0-train.json', img_prefix=data_root, classes=class_names, pipeline=train_pipeline), val=dict( type=dataset_type, ann_file=data_root + 'annotations/nuimages_v1.0-val.json', 
img_prefix=data_root, classes=class_names, pipeline=test_pipeline), test=dict( type=dataset_type, ann_file=data_root + 'annotations/nuimages_v1.0-val.json', img_prefix=data_root, classes=class_names, pipeline=test_pipeline)) evaluation = dict(metric=['bbox', 'segm']) ================================================ FILE: configs/_base_/datasets/nus-3d.py ================================================ # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-50, -50, -5, 50, 50, 3] # For nuScenes we usually do 10-class detection class_names = [ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' ] dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' # Input modality for nuScenes dataset, this is consistent with the submission # format which requires the information in input_modality. input_modality = dict( use_lidar=True, use_camera=False, use_radar=False, use_map=False, use_external=False) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. 
# file_client_args = dict( # backend='petrel', # path_mapping=dict({ # './data/nuscenes/': 's3://nuscenes/nuscenes/', # 'data/nuscenes/': 's3://nuscenes/nuscenes/' # })) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( samples_per_gpu=4, workers_per_gpu=4, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_train.pkl', pipeline=train_pipeline, classes=class_names, modality=input_modality, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet 
dataset. box_type_3d='LiDAR'), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_val.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_val.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR')) # For nuScenes dataset, we usually evaluate the model at the end of training. # Since the models are trained by 24 epochs by default, we set evaluation # interval to be 24. Please change the interval accordingly if you do not # use a default schedule. evaluation = dict(interval=24) ================================================ FILE: configs/_base_/datasets/range100_lyft-3d.py ================================================ # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-100, -100, -5, 100, 100, 3] # For Lyft we usually do 9-class detection class_names = [ 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', 'bicycle', 'pedestrian', 'animal' ] dataset_type = 'LyftDataset' data_root = 'data/lyft/' # Input modality for Lyft dataset, this is consistent with the submission # format which requires the information in input_modality. input_modality = dict( use_lidar=True, use_camera=False, use_radar=False, use_map=False, use_external=False) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. 
# file_client_args = dict( # backend='petrel', # path_mapping=dict({ # './data/lyft/': 's3://lyft/lyft/', # 'data/lyft/': 's3://lyft/lyft/' # })) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_train.pkl', pipeline=train_pipeline, classes=class_names, modality=input_modality, test_mode=False), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_val.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True), test=dict( 
type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_test.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True)) # For Lyft dataset, we usually evaluate the model at the end of training. # Since the models are trained by 24 epochs by default, we set evaluation # interval to be 24. Please change the interval accordingly if you do not # use a default schedule. evaluation = dict(interval=24) ================================================ FILE: configs/_base_/datasets/scannet-3d-18class.py ================================================ # dataset settings dataset_type = 'ScanNetDataset' data_root = './data/scannet/' class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', 'garbagebin') train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=True, load_dim=6, use_dim=[0, 1, 2]), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_mask_3d=True, with_seg_3d=True), dict( type='PointSegClassMapping', valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39)), dict(type='IndoorPointSample', num_points=40000), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.087266, 0.087266], scale_ratio_range=[1.0, 1.0], shift_height=True), dict(type='DefaultFormatBundle3D', class_names=class_names), dict( type='Collect3D', keys=[ 'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask', 'pts_instance_mask' ]) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=True, load_dim=6, use_dim=[0, 1, 2]), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], 
translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict(type='IndoorPointSample', num_points=40000), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( samples_per_gpu=8, workers_per_gpu=4, train=dict( type='RepeatDataset', times=5, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_train.pkl', pipeline=train_pipeline, filter_empty_gt=False, classes=class_names, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='Depth')), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, box_type_3d='Depth'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, box_type_3d='Depth')) ================================================ FILE: configs/_base_/datasets/sunrgbd-3d-10class.py ================================================ dataset_type = 'SUNRGBDDataset' data_root = 'data/sunrgbd/' class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', 'night_stand', 'bookshelf', 'bathtub') train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=True, load_dim=6, use_dim=[0, 1, 2]), dict(type='LoadAnnotations3D'), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, ), dict( type='GlobalRotScaleTrans', rot_range=[-0.523599, 0.523599], scale_ratio_range=[0.85, 1.15], shift_height=True), dict(type='IndoorPointSample', num_points=20000), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', 
coord_type='DEPTH', shift_height=True, load_dim=6, use_dim=[0, 1, 2]), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, ), dict(type='IndoorPointSample', num_points=20000), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( samples_per_gpu=16, workers_per_gpu=4, train=dict( type='RepeatDataset', times=5, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'sunrgbd_infos_train.pkl', pipeline=train_pipeline, classes=class_names, filter_empty_gt=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='Depth')), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'sunrgbd_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, box_type_3d='Depth'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'sunrgbd_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, box_type_3d='Depth')) ================================================ FILE: configs/_base_/datasets/waymoD5-3d-3class.py ================================================ # dataset settings # D5 in the config name means the whole dataset is divided into 5 folds # We only use one fold for efficient experiments dataset_type = 'WaymoDataset' data_root = 'data/waymo/kitti_format/' file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. 
# file_client_args = dict( # backend='petrel', path_mapping=dict(data='s3://waymo_data/')) class_names = ['Car', 'Pedestrian', 'Cyclist'] point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'waymo_dbinfos_train.pkl', rate=1.0, prepare=dict( filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), classes=class_names, sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10), points_loader=dict( type='LoadPointsFromFile', load_dim=5, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args)) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', 
keys=['points']) ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=4, train=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_train.pkl', split='training', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR', # load one frame every five frames load_interval=5)), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_val.pkl', split='training', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_val.pkl', split='training', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR')) evaluation = dict(interval=24) ================================================ FILE: configs/_base_/datasets/waymoD5-3d-car.py ================================================ # dataset settings # D5 in the config name means the whole dataset is divided into 5 folds # We only use one fold for efficient experiments dataset_type = 'WaymoDataset' data_root = 'data/waymo/kitti_format/' file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. 
# file_client_args = dict( # backend='petrel', path_mapping=dict(data='s3://waymo_data/')) class_names = ['Car'] point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'waymo_dbinfos_train.pkl', rate=1.0, prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), classes=class_names, sample_groups=dict(Car=15), points_loader=dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args)) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=4, 
train=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_train.pkl', split='training', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR', # load one frame every five frames load_interval=5)), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_val.pkl', split='training', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_val.pkl', split='training', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR')) evaluation = dict(interval=24) ================================================ FILE: configs/_base_/default_runtime.py ================================================ checkpoint_config = dict(interval=1) # yapf:disable push # By default we use textlogger hook and tensorboard # For more loggers see # https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) # yapf:enable dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = None load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: configs/_base_/models/3dssd.py ================================================ model = dict( type='SSD3DNet', backbone=dict( type='PointNet2SAMSG', in_channels=4, num_points=(4096, 512, (256, 256)), radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)), num_samples=((32, 32, 64), (32, 32, 64), (32, 32, 32)), sa_channels=(((16, 16, 32), (16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 64, 128), (64, 96, 128)), ((128, 128, 256), 
(128, 192, 256), (128, 256, 256))), aggregation_channels=(64, 128, 256), fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')), fps_sample_range_lists=((-1), (-1), (512, -1)), norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1), sa_cfg=dict( type='PointSAModuleMSG', pool_mod='max', use_xyz=True, normalize_xyz=False)), bbox_head=dict( type='SSD3DHead', in_channels=256, vote_module_cfg=dict( in_channels=256, num_points=256, gt_per_seed=1, conv_channels=(128, ), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), with_res_feat=False, vote_xyz_range=(3.0, 3.0, 2.0)), vote_aggregation_cfg=dict( type='PointSAModuleMSG', num_point=256, radii=(4.8, 6.4), sample_nums=(16, 32), mlp_channels=((256, 256, 256, 512), (256, 256, 512, 1024)), norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1), use_xyz=True, normalize_xyz=False, bias=True), pred_layer_cfg=dict( in_channels=1536, shared_conv_channels=(512, 128), cls_conv_channels=(128, ), reg_conv_channels=(128, ), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), bias=True), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), objectness_loss=dict( type='CrossEntropyLoss', use_sigmoid=True, reduction='sum', loss_weight=1.0), center_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=1.0), dir_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), dir_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=1.0), size_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=1.0), corner_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=1.0), vote_loss=dict(type='SmoothL1Loss', reduction='sum', loss_weight=1.0)), # model training and testing settings train_cfg=dict( sample_mod='spec', pos_distance_thr=10.0, expand_dims_length=0.05), test_cfg=dict( nms_cfg=dict(type='nms', iou_thr=0.1), sample_mod='spec', score_thr=0.0, per_class_proposal=True, max_output_num=100)) # optimizer # This schedule 
is mainly used by models on indoor dataset, # e.g., VoteNet on SUNRGBD and ScanNet lr = 0.002 # max learning rate optimizer = dict(type='AdamW', lr=lr, weight_decay=0) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict(policy='step', warmup=None, step=[80, 120]) # runtime settings total_epochs = 150 ================================================ FILE: configs/_base_/models/cascade_mask_rcnn_r50_fpn.py ================================================ # model settings model = dict( type='CascadeRCNN', pretrained='torchvision://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), roi_head=dict( type='CascadeRoIHead', num_stages=3, stage_loss_weights=[1, 0.5, 0.25], bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=80, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, 
roi_feat_size=7, num_classes=80, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=80, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) ], mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=80, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), # model training and testing settings train_cfg=dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', 
num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False) ]), test_cfg=dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_threshold=0.5), max_per_img=100, mask_thr_binary=0.5))) ================================================ FILE: configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py ================================================ voxel_size = [0.1, 0.1, 0.2] model = dict( type='CenterPoint', pts_voxel_layer=dict( max_num_points=10, voxel_size=voxel_size, max_voxels=(90000, 120000)), pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5), pts_middle_encoder=dict( type='SparseEncoder', in_channels=5, sparse_shape=[41, 1024, 1024], output_channels=128, order=('conv', 'norm', 'act'), encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), block_type='basicblock'), pts_backbone=dict( type='SECOND', in_channels=256, out_channels=[128, 256], layer_nums=[5, 5], layer_strides=[1, 2], norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[128, 256], out_channels=[256, 256], upsample_strides=[1, 2], norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), pts_bbox_head=dict( type='CenterHead', in_channels=sum([256, 256]), tasks=[ dict(num_class=1, class_names=['car']), dict(num_class=2, class_names=['truck', 'construction_vehicle']), dict(num_class=2, class_names=['bus', 
'trailer']), dict(num_class=1, class_names=['barrier']), dict(num_class=2, class_names=['motorcycle', 'bicycle']), dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), ], common_heads=dict( reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), share_conv_channel=64, bbox_coder=dict( type='CenterPointBBoxCoder', post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], max_num=500, score_threshold=0.1, out_size_factor=8, voxel_size=voxel_size[:2], code_size=9), separate_head=dict( type='SeparateHead', init_bias=-2.19, final_kernel=3), loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), norm_bbox=True), # model training and testing settings train_cfg=dict( pts=dict( grid_size=[1024, 1024, 40], voxel_size=voxel_size, out_size_factor=8, dense_reg=1, gaussian_overlap=0.1, max_objs=500, min_radius=2, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])), test_cfg=dict( pts=dict( post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], max_per_img=500, max_pool_nms=False, min_radius=[4, 12, 10, 1, 0.85, 0.175], score_threshold=0.1, out_size_factor=8, voxel_size=voxel_size[:2], nms_type='rotate', pre_max_size=1000, post_max_size=83, nms_thr=0.2))) ================================================ FILE: configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py ================================================ voxel_size = [0.2, 0.2, 8] model = dict( type='CenterPoint', pts_voxel_layer=dict( max_num_points=20, voxel_size=voxel_size, max_voxels=(30000, 40000)), pts_voxel_encoder=dict( type='PillarFeatureNet', in_channels=5, feat_channels=[64], with_distance=False, voxel_size=(0.2, 0.2, 8), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), legacy=False), pts_middle_encoder=dict( type='PointPillarsScatter', in_channels=64, output_shape=(512, 512)), pts_backbone=dict( type='SECOND', in_channels=64, out_channels=[64, 128, 256], layer_nums=[3, 5, 5], 
layer_strides=[2, 2, 2], norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[64, 128, 256], out_channels=[128, 128, 128], upsample_strides=[0.5, 1, 2], norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), pts_bbox_head=dict( type='CenterHead', in_channels=sum([128, 128, 128]), tasks=[ dict(num_class=1, class_names=['car']), dict(num_class=2, class_names=['truck', 'construction_vehicle']), dict(num_class=2, class_names=['bus', 'trailer']), dict(num_class=1, class_names=['barrier']), dict(num_class=2, class_names=['motorcycle', 'bicycle']), dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), ], common_heads=dict( reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), share_conv_channel=64, bbox_coder=dict( type='CenterPointBBoxCoder', post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], max_num=500, score_threshold=0.1, out_size_factor=4, voxel_size=voxel_size[:2], code_size=9), separate_head=dict( type='SeparateHead', init_bias=-2.19, final_kernel=3), loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), norm_bbox=True), # model training and testing settings train_cfg=dict( pts=dict( grid_size=[512, 512, 1], voxel_size=voxel_size, out_size_factor=4, dense_reg=1, gaussian_overlap=0.1, max_objs=500, min_radius=2, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])), test_cfg=dict( pts=dict( post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], max_per_img=500, max_pool_nms=False, min_radius=[4, 12, 10, 1, 0.85, 0.175], score_threshold=0.1, pc_range=[-51.2, -51.2], out_size_factor=4, voxel_size=voxel_size[:2], nms_type='rotate', pre_max_size=1000, post_max_size=83, nms_thr=0.2))) ================================================ FILE: configs/_base_/models/h3dnet.py 
================================================ primitive_z_cfg = dict( type='PrimitiveHead', num_dims=2, num_classes=18, primitive_mode='z', upper_thresh=100.0, surface_thresh=0.5, vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=1, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=1024, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True), feat_channels=(128, 128), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.4, 0.6], reduction='mean', loss_weight=30.0), center_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=0.5, loss_dst_weight=0.5), semantic_reg_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=0.5, loss_dst_weight=0.5), semantic_cls_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), train_cfg=dict( dist_thresh=0.2, var_thresh=1e-2, lower_thresh=1e-6, num_point=100, num_point_line=10, line_thresh=0.2)) primitive_xy_cfg = dict( type='PrimitiveHead', num_dims=1, num_classes=18, primitive_mode='xy', upper_thresh=100.0, surface_thresh=0.5, vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=1, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=1024, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True), feat_channels=(128, 128), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.4, 0.6], reduction='mean', loss_weight=30.0), 
center_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=0.5, loss_dst_weight=0.5), semantic_reg_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=0.5, loss_dst_weight=0.5), semantic_cls_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), train_cfg=dict( dist_thresh=0.2, var_thresh=1e-2, lower_thresh=1e-6, num_point=100, num_point_line=10, line_thresh=0.2)) primitive_line_cfg = dict( type='PrimitiveHead', num_dims=0, num_classes=18, primitive_mode='line', upper_thresh=100.0, surface_thresh=0.5, vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=1, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=1024, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True), feat_channels=(128, 128), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.4, 0.6], reduction='mean', loss_weight=30.0), center_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=1.0, loss_dst_weight=1.0), semantic_reg_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=1.0, loss_dst_weight=1.0), semantic_cls_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=2.0), train_cfg=dict( dist_thresh=0.2, var_thresh=1e-2, lower_thresh=1e-6, num_point=100, num_point_line=10, line_thresh=0.2)) model = dict( type='H3DNet', backbone=dict( type='MultiBackbone', num_streams=4, suffixes=['net0', 'net1', 'net2', 'net3'], conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.01), act_cfg=dict(type='ReLU'), backbones=dict( type='PointNet2SASSG', in_channels=4, num_points=(2048, 1024, 512, 256), radius=(0.2, 0.4, 0.8, 1.2), 
num_samples=(64, 32, 16, 16), sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), (128, 128, 256)), fp_channels=((256, 256), (256, 256)), norm_cfg=dict(type='BN2d'), sa_cfg=dict( type='PointSAModule', pool_mod='max', use_xyz=True, normalize_xyz=True))), rpn_head=dict( type='VoteHead', vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=3, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=256, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True), pred_layer_cfg=dict( in_channels=128, shared_conv_channels=(128, 128), bias=True), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.2, 0.8], reduction='sum', loss_weight=5.0), center_loss=dict( type='ChamferDistance', mode='l2', reduction='sum', loss_src_weight=10.0, loss_dst_weight=10.0), dir_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), dir_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), size_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), size_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), semantic_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), roi_head=dict( type='H3DRoIHead', primitive_list=[primitive_z_cfg, primitive_xy_cfg, primitive_line_cfg], bbox_head=dict( type='H3DBboxHead', gt_per_seed=3, num_proposal=256, suface_matching_cfg=dict( type='PointSAModule', num_point=256 * 6, radius=0.5, num_sample=32, mlp_channels=[128 + 6, 128, 64, 32], use_xyz=True, normalize_xyz=True), line_matching_cfg=dict( type='PointSAModule', num_point=256 * 12, radius=0.5, num_sample=32, mlp_channels=[128 + 12, 128, 64, 32], use_xyz=True, normalize_xyz=True), 
feat_channels=(128, 128), primitive_refine_channels=[128, 128, 128], upper_thresh=100.0, surface_thresh=0.5, line_thresh=0.5, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.2, 0.8], reduction='sum', loss_weight=5.0), center_loss=dict( type='ChamferDistance', mode='l2', reduction='sum', loss_src_weight=10.0, loss_dst_weight=10.0), dir_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), dir_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), size_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), size_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), semantic_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), cues_objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.3, 0.7], reduction='mean', loss_weight=5.0), cues_semantic_loss=dict( type='CrossEntropyLoss', class_weight=[0.3, 0.7], reduction='mean', loss_weight=5.0), proposal_objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.2, 0.8], reduction='none', loss_weight=5.0), primitive_center_loss=dict( type='MSELoss', reduction='none', loss_weight=1.0))), # model training and testing settings train_cfg=dict( rpn=dict( pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'), rpn_proposal=dict(use_nms=False), rcnn=dict( pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote', far_threshold=0.6, near_threshold=0.3, mask_surface_threshold=0.3, label_surface_threshold=0.3, mask_line_threshold=0.3, label_line_threshold=0.3)), test_cfg=dict( rpn=dict( sample_mod='seed', nms_thr=0.25, score_thr=0.05, per_class_proposal=True, use_nms=False), rcnn=dict( sample_mod='seed', nms_thr=0.25, score_thr=0.05, per_class_proposal=True))) ================================================ FILE: configs/_base_/models/hv_pointpillars_fpn_lyft.py ================================================ _base_ = 
'./hv_pointpillars_fpn_nus.py' # model settings (based on nuScenes model settings) # Voxel size for voxel encoder # Usually voxel size is changed consistently with the point cloud range # If point cloud range is modified, do remember to change all related # keys in the config. model = dict( pts_voxel_layer=dict( max_num_points=20, point_cloud_range=[-80, -80, -5, 80, 80, 3], max_voxels=(60000, 60000)), pts_voxel_encoder=dict( feat_channels=[64], point_cloud_range=[-80, -80, -5, 80, 80, 3]), pts_middle_encoder=dict(output_shape=[640, 640]), pts_bbox_head=dict( num_classes=9, anchor_generator=dict( ranges=[[-80, -80, -1.8, 80, 80, -1.8]], custom_values=[]), bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)), # model training settings (based on nuScenes model settings) train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]))) ================================================ FILE: configs/_base_/models/hv_pointpillars_fpn_nus.py ================================================ # model settings # Voxel size for voxel encoder # Usually voxel size is changed consistently with the point cloud range # If point cloud range is modified, do remember to change all related # keys in the config. 
voxel_size = [0.25, 0.25, 8] model = dict( type='MVXFasterRCNN', pts_voxel_layer=dict( max_num_points=64, point_cloud_range=[-50, -50, -5, 50, 50, 3], voxel_size=voxel_size, max_voxels=(30000, 40000)), pts_voxel_encoder=dict( type='HardVFE', in_channels=4, feat_channels=[64, 64], with_distance=False, voxel_size=voxel_size, with_cluster_center=True, with_voxel_center=True, point_cloud_range=[-50, -50, -5, 50, 50, 3], norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)), pts_middle_encoder=dict( type='PointPillarsScatter', in_channels=64, output_shape=[400, 400]), pts_backbone=dict( type='SECOND', in_channels=64, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), layer_nums=[3, 5, 5], layer_strides=[2, 2, 2], out_channels=[64, 128, 256]), pts_neck=dict( type='FPN', norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), act_cfg=dict(type='ReLU'), in_channels=[64, 128, 256], out_channels=256, start_level=0, num_outs=3), pts_bbox_head=dict( type='Anchor3DHead', num_classes=10, in_channels=256, feat_channels=256, use_direction_classifier=True, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[[-50, -50, -1.8, 50, 50, -1.8]], scales=[1, 2, 4], sizes=[ [0.8660, 2.5981, 1.], # 1.5/sqrt(3) [0.5774, 1.7321, 1.], # 1/sqrt(3) [1., 1., 1.], [0.4, 0.4, 1], ], custom_values=[0, 0], rotations=[0, 1.57], reshape_out=True), assigner_per_size=False, diff_rad_by_sin=True, dir_offset=0.7854, # pi/4 dir_limit_offset=0, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( pts=dict( assigner=dict( type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), 
allowed_border=0, code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], pos_weight=-1, debug=False)), test_cfg=dict( pts=dict( use_rotate_nms=True, nms_across_levels=False, nms_pre=1000, nms_thr=0.2, score_thr=0.05, min_bbox_size=0, max_num=500))) ================================================ FILE: configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py ================================================ _base_ = './hv_pointpillars_fpn_nus.py' # model settings (based on nuScenes model settings) # Voxel size for voxel encoder # Usually voxel size is changed consistently with the point cloud range # If point cloud range is modified, do remember to change all related # keys in the config. model = dict( pts_voxel_layer=dict( max_num_points=20, point_cloud_range=[-100, -100, -5, 100, 100, 3], max_voxels=(60000, 60000)), pts_voxel_encoder=dict( feat_channels=[64], point_cloud_range=[-100, -100, -5, 100, 100, 3]), pts_middle_encoder=dict(output_shape=[800, 800]), pts_bbox_head=dict( num_classes=9, anchor_generator=dict( ranges=[[-100, -100, -1.8, 100, 100, -1.8]], custom_values=[]), bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)), # model training settings (based on nuScenes model settings) train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]))) ================================================ FILE: configs/_base_/models/hv_pointpillars_secfpn_kitti.py ================================================ voxel_size = [0.16, 0.16, 4] model = dict( type='VoxelNet', voxel_layer=dict( max_num_points=32, point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1], voxel_size=voxel_size, max_voxels=(16000, 40000)), voxel_encoder=dict( type='PillarFeatureNet', in_channels=4, feat_channels=[64], with_distance=False, voxel_size=voxel_size, point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1]), middle_encoder=dict( type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]), backbone=dict( type='SECOND', in_channels=64, layer_nums=[3, 5, 5], 
layer_strides=[2, 2, 2], out_channels=[64, 128, 256]), neck=dict( type='SECONDFPN', in_channels=[64, 128, 256], upsample_strides=[1, 2, 4], out_channels=[128, 128, 128]), bbox_head=dict( type='Anchor3DHead', num_classes=3, in_channels=384, feat_channels=384, use_direction_classifier=True, anchor_generator=dict( type='Anchor3DRangeGenerator', ranges=[ [0, -39.68, -0.6, 70.4, 39.68, -0.6], [0, -39.68, -0.6, 70.4, 39.68, -0.6], [0, -39.68, -1.78, 70.4, 39.68, -1.78], ], sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1), ], allowed_border=0, pos_weight=-1, debug=False), test_cfg=dict( use_rotate_nms=True, nms_across_levels=False, nms_thr=0.01, score_thr=0.1, min_bbox_size=0, nms_pre=100, max_num=50)) ================================================ FILE: configs/_base_/models/hv_pointpillars_secfpn_waymo.py ================================================ # model settings # Voxel size for voxel encoder # Usually voxel size is changed consistently with the point cloud range # If point cloud range is modified, do remember to change all related # keys in the 
config. voxel_size = [0.32, 0.32, 6] model = dict( type='MVXFasterRCNN', pts_voxel_layer=dict( max_num_points=20, point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], voxel_size=voxel_size, max_voxels=(32000, 32000)), pts_voxel_encoder=dict( type='HardVFE', in_channels=5, feat_channels=[64], with_distance=False, voxel_size=voxel_size, with_cluster_center=True, with_voxel_center=True, point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)), pts_middle_encoder=dict( type='PointPillarsScatter', in_channels=64, output_shape=[468, 468]), pts_backbone=dict( type='SECOND', in_channels=64, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), layer_nums=[3, 5, 5], layer_strides=[1, 2, 2], out_channels=[64, 128, 256]), pts_neck=dict( type='SECONDFPN', norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), in_channels=[64, 128, 256], upsample_strides=[1, 2, 4], out_channels=[128, 128, 128]), pts_bbox_head=dict( type='Anchor3DHead', num_classes=3, in_channels=384, feat_channels=384, use_direction_classifier=True, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345], [-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188], [-74.88, -74.88, 0, 74.88, 74.88, 0]], sizes=[ [2.08, 4.73, 1.77], # car [0.84, 1.81, 1.77], # cyclist [0.84, 0.91, 1.74] # pedestrian ], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, dir_offset=0.7854, # pi/4 dir_limit_offset=0, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( pts=dict( assigner=[ dict( # car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.55, 
neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), dict( # cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), dict( # pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), ], allowed_border=0, code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], pos_weight=-1, debug=False)), test_cfg=dict( pts=dict( use_rotate_nms=True, nms_across_levels=False, nms_pre=4096, nms_thr=0.25, score_thr=0.1, min_bbox_size=0, max_num=500))) ================================================ FILE: configs/_base_/models/hv_second_secfpn_kitti.py ================================================ model = dict( type='VoxelNet', voxel_layer=dict( max_num_points=5, point_cloud_range=[0, -40, -3, 70.4, 40, 1], voxel_size=[0.05, 0.05, 0.1], max_voxels=(16000, 40000)), voxel_encoder=dict(type='HardSimpleVFE'), middle_encoder=dict( type='SparseEncoder', in_channels=4, sparse_shape=[41, 1600, 1408], order=('conv', 'norm', 'act')), backbone=dict( type='SECOND', in_channels=256, layer_nums=[5, 5], layer_strides=[1, 2], out_channels=[128, 256]), neck=dict( type='SECONDFPN', in_channels=[128, 256], upsample_strides=[1, 2], out_channels=[256, 256]), bbox_head=dict( type='Anchor3DHead', num_classes=3, in_channels=512, feat_channels=512, use_direction_classifier=True, anchor_generator=dict( type='Anchor3DRangeGenerator', ranges=[ [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -1.78, 70.4, 40.0, -1.78], ], sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), loss_dir=dict( type='CrossEntropyLoss', 
use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.35, neg_iou_thr=0.2, min_pos_iou=0.2, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.35, neg_iou_thr=0.2, min_pos_iou=0.2, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1), ], allowed_border=0, pos_weight=-1, debug=False), test_cfg=dict( use_rotate_nms=True, nms_across_levels=False, nms_thr=0.01, score_thr=0.1, min_bbox_size=0, nms_pre=100, max_num=50)) ================================================ FILE: configs/_base_/models/hv_second_secfpn_waymo.py ================================================ # model settings # Voxel size for voxel encoder # Usually voxel size is changed consistently with the point cloud range # If point cloud range is modified, do remember to change all related # keys in the config. 
voxel_size = [0.08, 0.08, 0.1] model = dict( type='VoxelNet', voxel_layer=dict( max_num_points=10, point_cloud_range=[-76.8, -51.2, -2, 76.8, 51.2, 4], voxel_size=voxel_size, max_voxels=(80000, 90000)), voxel_encoder=dict(type='HardSimpleVFE', num_features=5), middle_encoder=dict( type='SparseEncoder', in_channels=5, sparse_shape=[61, 1280, 1920], order=('conv', 'norm', 'act')), backbone=dict( type='SECOND', in_channels=384, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), layer_nums=[5, 5], layer_strides=[1, 2], out_channels=[128, 256]), neck=dict( type='SECONDFPN', norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), in_channels=[128, 256], upsample_strides=[1, 2], out_channels=[256, 256]), bbox_head=dict( type='Anchor3DHead', num_classes=3, in_channels=512, feat_channels=512, use_direction_classifier=True, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[[-76.8, -51.2, -0.0345, 76.8, 51.2, -0.0345], [-76.8, -51.2, 0, 76.8, 51.2, 0], [-76.8, -51.2, -0.1188, 76.8, 51.2, -0.1188]], sizes=[ [2.08, 4.73, 1.77], # car [0.84, 0.91, 1.74], # pedestrian [0.84, 1.81, 1.77] # cyclist ], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, dir_offset=0.7854, # pi/4 dir_limit_offset=0, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( assigner=[ dict( # car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.55, neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), dict( # pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), dict( # cyclist type='MaxIoUAssigner', 
iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1) ], allowed_border=0, code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], pos_weight=-1, debug=False), test_cfg=dict( use_rotate_nms=True, nms_across_levels=False, nms_pre=4096, nms_thr=0.25, score_thr=0.1, min_bbox_size=0, max_num=500)) ================================================ FILE: configs/_base_/models/imvotenet_image.py ================================================ model = dict( type='ImVoteNet', img_backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=False), norm_eval=True, style='caffe'), img_neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), img_rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), img_roi_head=dict( type='StandardRoIHead', bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=10, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=False, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0))), # model training and testing settings train_cfg=dict( img_rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, match_low_quality=True, 
ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=-1, pos_weight=-1, debug=False), img_rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), img_rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)), test_cfg=dict( img_rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), img_rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_threshold=0.5), max_per_img=100))) ================================================ FILE: configs/_base_/models/mask_rcnn_r50_fpn.py ================================================ # model settings model = dict( type='MaskRCNN', pretrained='torchvision://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), roi_head=dict( type='StandardRoIHead', bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='Shared2FCBBoxHead', in_channels=256, 
fc_out_channels=1024, roi_feat_size=7, num_classes=80, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=False, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=80, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), # model training and testing settings train_cfg=dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=-1, pos_weight=-1, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False)), test_cfg=dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_threshold=0.5), max_per_img=100, mask_thr_binary=0.5))) ================================================ FILE: configs/_base_/models/votenet.py ================================================ model = dict( type='VoteNet', backbone=dict( type='PointNet2SASSG', in_channels=4, num_points=(2048, 1024, 512, 256), radius=(0.2, 0.4, 0.8, 1.2), num_samples=(64, 32, 16, 16), sa_channels=((64, 64, 
128), (128, 128, 256), (128, 128, 256), (128, 128, 256)), fp_channels=((256, 256), (256, 256)), norm_cfg=dict(type='BN2d'), sa_cfg=dict( type='PointSAModule', pool_mod='max', use_xyz=True, normalize_xyz=True)), bbox_head=dict( type='VoteHead', vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=3, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=256, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True), pred_layer_cfg=dict( in_channels=128, shared_conv_channels=(128, 128), bias=True), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.2, 0.8], reduction='sum', loss_weight=5.0), center_loss=dict( type='ChamferDistance', mode='l2', reduction='sum', loss_src_weight=10.0, loss_dst_weight=10.0), dir_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), dir_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), size_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), size_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0), semantic_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), # model training and testing settings train_cfg=dict( pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'), test_cfg=dict( sample_mod='seed', nms_thr=0.25, score_thr=0.05, per_class_proposal=True)) ================================================ FILE: configs/_base_/schedules/cyclic_20e.py ================================================ # For nuScenes dataset, we usually evaluate the model at the end of training. # Since the models are trained for 20 epochs with this schedule, we set evaluation # interval to be 20.
Please change the interval accordingly if you do not # use a default schedule. # optimizer # This schedule is mainly used by models on nuScenes dataset optimizer = dict(type='AdamW', lr=1e-4, weight_decay=0.01) # max_norm=10 is better for SECOND optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='cyclic', target_ratio=(10, 1e-4), cyclic_times=1, step_ratio_up=0.4, ) momentum_config = dict( policy='cyclic', target_ratio=(0.85 / 0.95, 1), cyclic_times=1, step_ratio_up=0.4, ) # runtime settings total_epochs = 20 ================================================ FILE: configs/_base_/schedules/cyclic_40e.py ================================================ # The schedule is usually used by models trained on KITTI dataset # The learning rate set in the cyclic schedule is the initial learning rate # rather than the max learning rate. Since the target_ratio is (10, 1e-4), # the learning rate will change from 0.0018 to 0.018, then go to 0.0018*1e-4 lr = 0.0018 # The optimizer follows the setting in SECOND.Pytorch, but here we use # the official AdamW optimizer implemented by PyTorch.
optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) # We use cyclic learning rate and momentum schedule following SECOND.Pytorch # https://github.com/traveller59/second.pytorch/blob/3aba19c9688274f75ebb5e576f65cfe54773c021/torchplus/train/learning_schedules_fastai.py#L69 # noqa # We implement them in mmcv, for more details, please refer to # https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/lr_updater.py#L327 # noqa # https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/momentum_updater.py#L130 # noqa lr_config = dict( policy='cyclic', target_ratio=(10, 1e-4), cyclic_times=1, step_ratio_up=0.4, ) momentum_config = dict( policy='cyclic', target_ratio=(0.85 / 0.95, 1), cyclic_times=1, step_ratio_up=0.4, ) # Although the total_epochs is 40, this schedule is usually used with # RepeatDataset with repeat ratio N, thus the actual total epoch # number could be Nx40 total_epochs = 40 ================================================ FILE: configs/_base_/schedules/mmdet_schedule_1x.py ================================================ # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=None) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=0.001, step=[8, 11]) total_epochs = 12 ================================================ FILE: configs/_base_/schedules/schedule_2x.py ================================================ # optimizer # This schedule is mainly used by models on nuScenes dataset optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01) # max_norm=10 is better for SECOND optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=1.0 / 1000, step=[20, 23]) momentum_config = None #
runtime settings total_epochs = 24 ================================================ FILE: configs/_base_/schedules/schedule_3x.py ================================================ # optimizer # This schedule is mainly used by models on indoor dataset, # e.g., VoteNet on SUNRGBD and ScanNet lr = 0.008 # max learning rate optimizer = dict(type='AdamW', lr=lr, weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) lr_config = dict(policy='step', warmup=None, step=[24, 32]) # runtime settings total_epochs = 36 ================================================ FILE: configs/benchmark/hv_PartA2_secfpn_4x8_cyclic_80e_pcdet_kitti-3d-3class.py ================================================ # model settings voxel_size = [0.05, 0.05, 0.1] point_cloud_range = [0, -40, -3, 70.4, 40, 1] # velodyne coordinates, x, y, z model = dict( type='PartA2', voxel_layer=dict( max_num_points=5, # max_points_per_voxel point_cloud_range=point_cloud_range, voxel_size=voxel_size, max_voxels=(16000, 40000) # (training, testing) max_voxels ), voxel_encoder=dict(type='HardSimpleVFE'), middle_encoder=dict( type='SparseUNet', in_channels=4, sparse_shape=[41, 1600, 1408], order=('conv', 'norm', 'act')), backbone=dict( type='SECOND', in_channels=256, layer_nums=[5, 5], layer_strides=[1, 2], out_channels=[128, 256]), neck=dict( type='SECONDFPN', in_channels=[128, 256], upsample_strides=[1, 2], out_channels=[256, 256]), rpn_head=dict( type='PartA2RPNHead', num_classes=3, in_channels=512, feat_channels=512, use_direction_classifier=True, anchor_generator=dict( type='Anchor3DRangeGenerator', ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -1.78, 70.4, 40.0, -1.78]], sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, assigner_per_size=True, assign_per_class=True, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), loss_cls=dict( type='FocalLoss', use_sigmoid=True,
gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), roi_head=dict( type='PartAggregationROIHead', num_classes=3, semantic_head=dict( type='PointwiseSemanticHead', in_channels=16, extra_width=0.2, seg_score_thr=0.3, num_classes=3, loss_seg=dict( type='FocalLoss', use_sigmoid=True, reduction='sum', gamma=2.0, alpha=0.25, loss_weight=1.0), loss_part=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), seg_roi_extractor=dict( type='Single3DRoIAwareExtractor', roi_layer=dict( type='RoIAwarePool3d', out_size=14, max_pts_per_voxel=128, mode='max')), part_roi_extractor=dict( type='Single3DRoIAwareExtractor', roi_layer=dict( type='RoIAwarePool3d', out_size=14, max_pts_per_voxel=128, mode='avg')), bbox_head=dict( type='PartA2BboxHead', num_classes=3, seg_in_channels=16, part_in_channels=4, seg_conv_channels=[64, 64], part_conv_channels=[64, 64], merge_conv_channels=[128, 128], down_conv_channels=[128, 256], bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), shared_fc_channels=[256, 512, 512, 512], cls_channels=[256, 256], reg_channels=[256, 256], dropout_ratio=0.1, roi_feat_size=14, with_corner_loss=True, loss_bbox=dict( type='SmoothL1Loss', beta=1.0 / 9.0, reduction='sum', loss_weight=1.0), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, reduction='sum', loss_weight=1.0))), # model training and testing settings train_cfg=dict( rpn=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, 
ignore_iof_thr=-1) ], allowed_border=0, pos_weight=-1, debug=False), rpn_proposal=dict( nms_pre=9000, nms_post=512, max_num=512, nms_thr=0.8, score_thr=0, use_rotate_nms=False), rcnn=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict( type='BboxOverlaps3D', coordinate='lidar'), pos_iou_thr=0.55, neg_iou_thr=0.55, min_pos_iou=0.55, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict( type='BboxOverlaps3D', coordinate='lidar'), pos_iou_thr=0.55, neg_iou_thr=0.55, min_pos_iou=0.55, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict( type='BboxOverlaps3D', coordinate='lidar'), pos_iou_thr=0.55, neg_iou_thr=0.55, min_pos_iou=0.55, ignore_iof_thr=-1) ], sampler=dict( type='IoUNegPiecewiseSampler', num=128, pos_fraction=0.55, neg_piece_fractions=[0.8, 0.2], neg_iou_piece_thrs=[0.55, 0.1], neg_pos_ub=-1, add_gt_as_proposals=False, return_iou=True), cls_pos_thr=0.75, cls_neg_thr=0.25)), test_cfg=dict( rpn=dict( nms_pre=1024, nms_post=100, max_num=100, nms_thr=0.7, score_thr=0, use_rotate_nms=True), rcnn=dict( use_rotate_nms=True, use_raw_score=True, nms_thr=0.01, score_thr=0.3))) # dataset settings dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Pedestrian', 'Cyclist', 'Car'] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'kitti_dbinfos_train.pkl', rate=1.0, prepare=dict( filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)), classes=class_names, sample_groups=dict(Car=20, Pedestrian=15, Cyclist=15)) train_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict(type='ObjectSample', db_sampler=db_sampler), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], 
scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( samples_per_gpu=4, workers_per_gpu=4, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_train.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True)) # optimizer lr = 0.001 # max learning rate optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) lr_config = dict( policy='cyclic', target_ratio=(10, 1e-4), cyclic_times=1, step_ratio_up=0.4) momentum_config = dict( policy='cyclic', target_ratio=(0.85 
/ 0.95, 1), cyclic_times=1, step_ratio_up=0.4) checkpoint_config = dict(interval=1) evaluation = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 80 dist_params = dict(backend='nccl', port=29506) log_level = 'INFO' find_unused_parameters = True work_dir = './work_dirs/parta2_secfpn_80e' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: configs/benchmark/hv_pointpillars_secfpn_3x8_100e_det3d_kitti-3d-car.py ================================================ # model settings voxel_size = [0.16, 0.16, 4] point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1] model = dict( type='VoxelNet', voxel_layer=dict( max_num_points=64, point_cloud_range=point_cloud_range, voxel_size=voxel_size, max_voxels=(12000, 20000)), voxel_encoder=dict( type='PillarFeatureNet', in_channels=4, feat_channels=[64], with_distance=False, voxel_size=voxel_size, point_cloud_range=point_cloud_range), middle_encoder=dict( type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]), backbone=dict( type='SECOND', in_channels=64, layer_nums=[3, 5, 5], layer_strides=[2, 2, 2], out_channels=[64, 128, 256]), neck=dict( type='SECONDFPN', in_channels=[64, 128, 256], upsample_strides=[1, 2, 4], out_channels=[128, 128, 128]), bbox_head=dict( type='Anchor3DHead', num_classes=1, in_channels=384, feat_channels=384, use_direction_classifier=True, anchor_generator=dict( type='Anchor3DRangeGenerator', ranges=[[0, -39.68, -1.78, 69.12, 39.68, -1.78]], sizes=[[1.6, 3.9, 1.56]], rotations=[0, 1.57], reshape_out=True), diff_rad_by_sin=True, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, 
loss_weight=0.2)), # model training and testing settings train_cfg=dict( assigner=dict( type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1), allowed_border=0, pos_weight=-1, debug=False), test_cfg=dict( use_rotate_nms=True, nms_across_levels=False, nms_thr=0.01, score_thr=0.1, min_bbox_size=0, nms_pre=100, max_num=50)) # dataset settings dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Car'] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'kitti_dbinfos_train.pkl', rate=1.0, prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), sample_groups=dict(Car=15), classes=class_names) train_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='ObjectNoise', num_try=100, loc_noise_std=[0.25, 0.25, 0.25], global_rot_range=[0.0, 0.0], rot_uniform_noise=[-0.15707963267, 0.15707963267]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict( type='GlobalRotScale', rot_uniform_noise=[-0.78539816, 0.78539816], scaling_uniform_noise=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=3, workers_per_gpu=3, train=dict( type='RepeatDataset', 
times=2, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_train.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False)), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True)) # optimizer lr = 0.001 # max learning rate optimizer = dict( type='AdamW', lr=lr, betas=(0.95, 0.99), # the momentum is changed during training weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='cyclic', target_ratio=(10, 1e-4), cyclic_times=1, step_ratio_up=0.4) momentum_config = dict( policy='cyclic', target_ratio=(0.85 / 0.95, 1), cyclic_times=1, step_ratio_up=0.4) checkpoint_config = dict(interval=1) evaluation = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 50 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/pp_secfpn_100e' load_from = None resume_from = None workflow = [('train', 50)] ================================================ FILE: configs/benchmark/hv_pointpillars_secfpn_4x8_80e_pcdet_kitti-3d-3class.py ================================================ # model settings point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1] voxel_size = [0.16, 0.16, 4] model = dict( type='VoxelNet', voxel_layer=dict( max_num_points=32, # max_points_per_voxel point_cloud_range=point_cloud_range, voxel_size=voxel_size,
max_voxels=(16000, 40000) # (training, testing) max_voxels ), voxel_encoder=dict( type='PillarFeatureNet', in_channels=4, feat_channels=[64], with_distance=False, voxel_size=voxel_size, point_cloud_range=point_cloud_range, ), middle_encoder=dict( type='PointPillarsScatter', in_channels=64, output_shape=[496, 432], ), backbone=dict( type='SECOND', in_channels=64, layer_nums=[3, 5, 5], layer_strides=[2, 2, 2], out_channels=[64, 128, 256], ), neck=dict( type='SECONDFPN', in_channels=[64, 128, 256], upsample_strides=[1, 2, 4], out_channels=[128, 128, 128], ), bbox_head=dict( type='Anchor3DHead', num_classes=3, in_channels=384, feat_channels=384, use_direction_classifier=True, anchor_generator=dict( type='Anchor3DRangeGenerator', ranges=[ [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -1.78, 70.4, 40.0, -1.78], ], sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2), ), # model training and testing settings train_cfg=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1), ], allowed_border=0, pos_weight=-1, debug=False), test_cfg=dict( use_rotate_nms=True, nms_across_levels=False, nms_thr=0.01, score_thr=0.1, min_bbox_size=0,
nms_pre=100, max_num=50)) # dataset settings dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Pedestrian', 'Cyclist', 'Car'] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'kitti_dbinfos_train.pkl', rate=1.0, prepare=dict( filter_by_difficulty=[-1], filter_by_min_points=dict( Car=5, Pedestrian=5, Cyclist=5, )), classes=class_names, sample_groups=dict( Car=15, Pedestrian=15, Cyclist=15, )) train_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict(type='ObjectSample', db_sampler=db_sampler), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']), ] test_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( samples_per_gpu=4, workers_per_gpu=4, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_train.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False), val=dict( 
type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True)) # optimizer lr = 0.0003 # max learning rate optimizer = dict( type='AdamW', lr=lr, betas=(0.95, 0.99), # the momentum is changed during training weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) # learning policy lr_config = dict( policy='cyclic', target_ratio=(10, 1e-4), cyclic_times=1, step_ratio_up=0.4) momentum_config = dict( policy='cyclic', target_ratio=(0.85 / 0.95, 1), cyclic_times=1, step_ratio_up=0.4) checkpoint_config = dict(interval=1) evaluation = dict(interval=2) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 80 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/pp_secfpn_80e' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: configs/benchmark/hv_second_secfpn_4x8_80e_pcdet_kitti-3d-3class.py ================================================ # model settings voxel_size = [0.05, 0.05, 0.1] point_cloud_range = [0, -40, -3, 70.4, 40, 1] model = dict( type='VoxelNet', voxel_layer=dict( max_num_points=5, point_cloud_range=point_cloud_range, voxel_size=voxel_size, max_voxels=(16000, 40000)), voxel_encoder=dict(type='HardSimpleVFE'), middle_encoder=dict( type='SparseEncoder', in_channels=4, sparse_shape=[41, 1600, 1408], order=('conv', 'norm', 'act')), backbone=dict( type='SECOND', in_channels=256, layer_nums=[5, 5], layer_strides=[1, 2], out_channels=[128, 256]),
neck=dict( type='SECONDFPN', in_channels=[128, 256], upsample_strides=[1, 2], out_channels=[256, 256]), bbox_head=dict( type='Anchor3DHead', num_classes=3, in_channels=512, feat_channels=512, use_direction_classifier=True, anchor_generator=dict( type='Anchor3DRangeGenerator', ranges=[ [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -1.78, 70.4, 40.0, -1.78], ], sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1), ], allowed_border=0, pos_weight=-1, debug=False), test_cfg=dict( use_rotate_nms=True, nms_across_levels=False, nms_thr=0.01, score_thr=0.1, min_bbox_size=0, nms_pre=100, max_num=50)) # dataset settings dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Pedestrian', 'Cyclist', 'Car'] input_modality = dict(use_lidar=False, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'kitti_dbinfos_train.pkl', rate=1.0, prepare=dict( filter_by_difficulty=[-1], filter_by_min_points=dict( Car=5, Pedestrian=5, Cyclist=5, )), classes=class_names, sample_groups=dict( Car=20, 
Pedestrian=15, Cyclist=15, )) file_client_args = dict(backend='disk') # file_client_args = dict( # backend='petrel', path_mapping=dict(data='s3://kitti_data/')) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args), dict(type='ObjectSample', db_sampler=db_sampler), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( samples_per_gpu=4, workers_per_gpu=4, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_train.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True), test=dict( 
type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True)) # optimizer lr = 0.0003 # max learning rate optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) lr_config = dict( policy='cyclic', target_ratio=(10, 1e-4), cyclic_times=1, step_ratio_up=0.4) momentum_config = dict( policy='cyclic', target_ratio=(0.85 / 0.95, 1), cyclic_times=1, step_ratio_up=0.4) checkpoint_config = dict(interval=1) evaluation = dict(interval=2) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 80 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/sec_secfpn_80e' load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: configs/centerpoint/README.md ================================================ # Center-based 3D Object Detection and Tracking ## Introduction [ALGORITHM] We implement CenterPoint and provide the result and checkpoints on nuScenes dataset. We follow the below style to name config files. Contributors are advised to follow the same style. `{xxx}` is required field and `[yyy]` is optional. `{model}`: model type like `centerpoint`. `{model setting}`: voxel size and voxel type like `01voxel`, `02pillar`. `{backbone}`: backbone type like `second`. `{neck}`: neck type like `secfpn`. `[dcn]`: Whether to use deformable convolution. `[circle]`: Whether to use circular nms. `[batch_per_gpu x gpu]`: GPUs and samples per GPU, 4x8 is used by default. `{schedule}`: training schedule, options are 1x, 2x, 20e, etc. 1x and 2x means 12 epochs and 24 epochs respectively. 20e is adopted in cascade models, which denotes 20 epochs. 
For 1x/2x, the initial learning rate decays by a factor of 10 at the 8th/16th and 11th/22nd epochs. For 20e, the initial learning rate decays by a factor of 10 at the 16th and 19th epochs. `{dataset}`: dataset like nus-3d, kitti-3d, lyft-3d, scannet-3d, sunrgbd-3d. We also indicate the number of classes we are using if there exist multiple settings, e.g., kitti-3d-3class and kitti-3d-car mean training on KITTI dataset with 3 classes and single class, respectively. ``` @article{yin2021center, title={Center-based 3D Object Detection and Tracking}, author={Yin, Tianwei and Zhou, Xingyi and Kr{\"a}henb{\"u}hl, Philipp}, journal={CVPR}, year={2021}, } ``` ## Usage ### Test time augmentation We support double-flip and scale augmentation at test time. To use test time augmentation, users need to modify the `test_pipeline` and `test_cfg` in the config. For example, we change `centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus.py` to the following. ```python _base_ = './centerpoint_0075voxel_second_secfpn_circlenms' \ '_4x8_cyclic_20e_nus.py' model = dict( test_cfg=dict( pts=dict( use_rotate_nms=True, max_num=83))) point_cloud_range = [-54, -54, -5.0, 54, 54, 3.0] file_client_args = dict(backend='disk') class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] test_pipeline = [ dict( type='LoadPointsFromFile', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=9, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args, pad_empty_sweeps=True, remove_close=True), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=[0.95, 1.0, 1.05], flip=True, pcd_horizontal_flip=True, pcd_vertical_flip=True, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D', sync_2d=False), dict( type='PointsRangeFilter',
point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline)) ``` ## Results ### CenterPoint |Backbone| Voxel type (voxel size) |Dcn|Circular nms| Mem (GB) | Inf time (fps) | mAP |NDS| Download | | :---------: |:-----: |:-----: | :------: | :------------: | :----: |:----: | :------: |:------: | |[SECFPN](./centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus.py)|voxel (0.1)|✗|✓|4.9| |56.19|64.43|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus_20201001_135205-5db91e00.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus_20201001_135205.log.json)| |above w/o circle nms|voxel (0.1)|✗|✗| | |56.56|64.46|| |[SECFPN](./centerpoint_01voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus.py)|voxel (0.1)|✓|✓|5.2| |56.34|64.81|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_01voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus/centerpoint_01voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus_20201004_075317-26d8176c.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_01voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus/centerpoint_01voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus_20201004_075317.log.json)| |above w/o circle nms|voxel (0.1)|✓|✗| | |56.60|64.90|| |[SECFPN](./centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus.py)|voxel (0.075)|✗|✓|7.8| 
|57.34|65.23|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus_20200925_230905-358fbe3b.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus_20200925_230905.log.json)| |above w/o circle nms|voxel (0.075)|✗|✗| | |57.63|65.39| | |[SECFPN](./centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus.py)|voxel (0.075)|✓|✓|8.5| |57.27|65.58|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus/centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus_20200930_201619-67c8496f.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus/centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus_20200930_201619.log.json)| |above w/o circle nms|voxel (0.075)|✓|✗| | |57.43|65.63|| |above w/ double flip|voxel (0.075)|✓|✗| | |59.73|67.39|| |above w/ scale tta|voxel (0.075)|✓|✗| | |60.43|67.65|| |above w/ circle nms w/o scale tta|voxel (0.075)|✓|✗| | |59.52|67.24|| |[SECFPN](./centerpoint_02pillar_second_secfpn_circlenms_4x8_cyclic_20e_nus.py)|pillar (0.2)|✗|✓|4.4| |49.07|59.66|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_02pillar_second_secfpn_circlenms_4x8_cyclic_20e_nus_20201004_170716-a134a233.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_02pillar_second_secfpn_circlenms_4x8_cyclic_20e_nus_20201004_170716.log.json)| |above w/o 
circle nms|pillar (0.2)|✗|✗| | |49.12|59.66|| |[SECFPN](./centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus.py)|pillar (0.2)|✓|✗| 4.6| |48.8 |59.67 |[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus_20200930_103722-3bb135f2.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus_20200930_103722.log.json)| |above w/ circle nms|pillar (0.2)|✓|✓| | |48.79|59.65|| ================================================ FILE: configs/centerpoint/centerpoint_0075voxel_second_secfpn_4x8_cyclic_20e_nus.py ================================================ _base_ = ['./centerpoint_01voxel_second_secfpn_4x8_cyclic_20e_nus.py'] # If point cloud range is changed, the models should also change their point # cloud range accordingly voxel_size = [0.075, 0.075, 0.2] point_cloud_range = [-54, -54, -5.0, 54, 54, 3.0] # For nuScenes we usually do 10-class detection class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] model = dict( pts_voxel_layer=dict( voxel_size=voxel_size, point_cloud_range=point_cloud_range), pts_middle_encoder=dict(sparse_shape=[41, 1440, 1440]), pts_bbox_head=dict( bbox_coder=dict( voxel_size=voxel_size[:2], pc_range=point_cloud_range[:2])), train_cfg=dict( pts=dict( grid_size=[1440, 1440, 40], voxel_size=voxel_size, point_cloud_range=point_cloud_range)), test_cfg=dict( pts=dict(voxel_size=voxel_size[:2], pc_range=point_cloud_range[:2]))) dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' file_client_args = dict(backend='disk') db_sampler = dict( data_root=data_root, info_path=data_root + 'nuscenes_dbinfos_train.pkl', rate=1.0, prepare=dict( filter_by_difficulty=[-1], 
filter_by_min_points=dict( car=5, truck=5, bus=5, trailer=5, construction_vehicle=5, traffic_cone=5, barrier=5, motorcycle=5, bicycle=5, pedestrian=5)), classes=class_names, sample_groups=dict( car=2, truck=3, construction_vehicle=7, bus=4, trailer=6, barrier=2, motorcycle=6, bicycle=6, pedestrian=2, traffic_cone=2), points_loader=dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args)) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=9, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args, pad_empty_sweeps=True, remove_close=True), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=9, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args, pad_empty_sweeps=True, remove_close=True), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( 
type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( train=dict(dataset=dict(pipeline=train_pipeline)), val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline)) ================================================ FILE: configs/centerpoint/centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus.py ================================================ _base_ = ['./centerpoint_0075voxel_second_secfpn_4x8_cyclic_20e_nus.py'] model = dict(test_cfg=dict(pts=dict(nms_type='circle'))) ================================================ FILE: configs/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_20e_nus.py ================================================ _base_ = ['./centerpoint_0075voxel_second_secfpn_4x8_cyclic_20e_nus.py'] model = dict( pts_bbox_head=dict( separate_head=dict( type='DCNSeparateHead', dcn_config=dict( type='DCN', in_channels=64, out_channels=64, kernel_size=3, padding=1, groups=4), init_bias=-2.19, final_kernel=3))) ================================================ FILE: configs/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_flip-tta_20e_nus.py ================================================ _base_ = './centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_20e_nus.py' point_cloud_range = [-54, -54, -5.0, 54, 54, 3.0] file_client_args = dict(backend='disk') class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=9, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args, pad_empty_sweeps=True, remove_close=True), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, # Add double-flip 
augmentation flip=True, pcd_horizontal_flip=True, pcd_vertical_flip=True, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D', sync_2d=False), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline)) ================================================ FILE: configs/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_tta_20e_nus.py ================================================ _base_ = './centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_20e_nus.py' test_cfg = dict(pts=dict(use_rotate_nms=True, max_num=500)) point_cloud_range = [-54, -54, -5.0, 54, 54, 3.0] file_client_args = dict(backend='disk') class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=9, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args, pad_empty_sweeps=True, remove_close=True), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=[0.95, 1.0, 1.05], # Add double-flip augmentation flip=True, pcd_horizontal_flip=True, pcd_vertical_flip=True, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D', sync_2d=False), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline)) 
================================================ FILE: configs/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus.py ================================================ _base_ = ['./centerpoint_0075voxel_second_secfpn_4x8_cyclic_20e_nus.py'] model = dict( pts_bbox_head=dict( separate_head=dict( type='DCNSeparateHead', dcn_config=dict( type='DCN', in_channels=64, out_channels=64, kernel_size=3, padding=1, groups=4), init_bias=-2.19, final_kernel=3)), test_cfg=dict(pts=dict(nms_type='circle'))) ================================================ FILE: configs/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_flip-tta_20e_nus.py ================================================ _base_ = './centerpoint_0075voxel_second_secfpn_dcn_' \ 'circlenms_4x8_cyclic_20e_nus.py' point_cloud_range = [-54, -54, -5.0, 54, 54, 3.0] file_client_args = dict(backend='disk') class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=9, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args, pad_empty_sweeps=True, remove_close=True), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, # Add double-flip augmentation flip=True, pcd_horizontal_flip=True, pcd_vertical_flip=True, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D', sync_2d=False), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline)) ================================================ 
FILE: configs/centerpoint/centerpoint_01voxel_second_secfpn_4x8_cyclic_20e_nus.py ================================================ _base_ = [ '../_base_/datasets/nus-3d.py', '../_base_/models/centerpoint_01voxel_second_secfpn_nus.py', '../_base_/schedules/cyclic_20e.py', '../_base_/default_runtime.py' ] # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] # For nuScenes we usually do 10-class detection class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] model = dict( pts_voxel_layer=dict(point_cloud_range=point_cloud_range), pts_bbox_head=dict(bbox_coder=dict(pc_range=point_cloud_range[:2])), # model training and testing settings train_cfg=dict(pts=dict(point_cloud_range=point_cloud_range)), test_cfg=dict(pts=dict(pc_range=point_cloud_range[:2]))) dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' file_client_args = dict(backend='disk') db_sampler = dict( data_root=data_root, info_path=data_root + 'nuscenes_dbinfos_train.pkl', rate=1.0, prepare=dict( filter_by_difficulty=[-1], filter_by_min_points=dict( car=5, truck=5, bus=5, trailer=5, construction_vehicle=5, traffic_cone=5, barrier=5, motorcycle=5, bicycle=5, pedestrian=5)), classes=class_names, sample_groups=dict( car=2, truck=3, construction_vehicle=7, bus=4, trailer=6, barrier=2, motorcycle=6, bicycle=6, pedestrian=2, traffic_cone=2), points_loader=dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args)) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=9, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args, pad_empty_sweeps=True, remove_close=True), dict(type='LoadAnnotations3D', with_bbox_3d=True, 
with_label_3d=True), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=9, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args, pad_empty_sweeps=True, remove_close=True), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( train=dict( type='CBGSDataset', dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_train.pkl', pipeline=train_pipeline, classes=class_names, test_mode=False, use_valid_flag=True, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
box_type_3d='LiDAR')), val=dict(pipeline=test_pipeline, classes=class_names), test=dict(pipeline=test_pipeline, classes=class_names)) evaluation = dict(interval=20) ================================================ FILE: configs/centerpoint/centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus.py ================================================ _base_ = ['./centerpoint_01voxel_second_secfpn_4x8_cyclic_20e_nus.py'] model = dict(test_cfg=dict(pts=dict(nms_type='circle'))) ================================================ FILE: configs/centerpoint/centerpoint_01voxel_second_secfpn_dcn_4x8_cyclic_20e_nus.py ================================================ _base_ = ['./centerpoint_01voxel_second_secfpn_4x8_cyclic_20e_nus.py'] model = dict( pts_bbox_head=dict( separate_head=dict( type='DCNSeparateHead', dcn_config=dict( type='DCN', in_channels=64, out_channels=64, kernel_size=3, padding=1, groups=4), init_bias=-2.19, final_kernel=3))) ================================================ FILE: configs/centerpoint/centerpoint_01voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus.py ================================================ _base_ = ['./centerpoint_01voxel_second_secfpn_4x8_cyclic_20e_nus.py'] model = dict( pts_bbox_head=dict( separate_head=dict( type='DCNSeparateHead', dcn_config=dict( type='DCN', in_channels=64, out_channels=64, kernel_size=3, padding=1, groups=4), init_bias=-2.19, final_kernel=3)), test_cfg=dict(pts=dict(nms_type='circle'))) ================================================ FILE: configs/centerpoint/centerpoint_02pillar_second_secfpn_4x8_cyclic_20e_nus.py ================================================ _base_ = [ '../_base_/datasets/nus-3d.py', '../_base_/models/centerpoint_02pillar_second_secfpn_nus.py', '../_base_/schedules/cyclic_20e.py', '../_base_/default_runtime.py' ] # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] # For 
nuScenes we usually do 10-class detection class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] model = dict( pts_voxel_layer=dict(point_cloud_range=point_cloud_range), pts_voxel_encoder=dict(point_cloud_range=point_cloud_range), pts_bbox_head=dict(bbox_coder=dict(pc_range=point_cloud_range[:2])), # model training and testing settings train_cfg=dict(pts=dict(point_cloud_range=point_cloud_range)), test_cfg=dict(pts=dict(pc_range=point_cloud_range[:2]))) dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' file_client_args = dict(backend='disk') db_sampler = dict( data_root=data_root, info_path=data_root + 'nuscenes_dbinfos_train.pkl', rate=1.0, prepare=dict( filter_by_difficulty=[-1], filter_by_min_points=dict( car=5, truck=5, bus=5, trailer=5, construction_vehicle=5, traffic_cone=5, barrier=5, motorcycle=5, bicycle=5, pedestrian=5)), classes=class_names, sample_groups=dict( car=2, truck=3, construction_vehicle=7, bus=4, trailer=6, barrier=2, motorcycle=6, bicycle=6, pedestrian=2, traffic_cone=2), points_loader=dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args)) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=9, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args, pad_empty_sweeps=True, remove_close=True), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', 
point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=9, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args, pad_empty_sweeps=True, remove_close=True), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( train=dict( type='CBGSDataset', dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_train.pkl', pipeline=train_pipeline, classes=class_names, test_mode=False, use_valid_flag=True, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
box_type_3d='LiDAR')), val=dict(pipeline=test_pipeline, classes=class_names), test=dict(pipeline=test_pipeline, classes=class_names)) evaluation = dict(interval=20) ================================================ FILE: configs/centerpoint/centerpoint_02pillar_second_secfpn_circlenms_4x8_cyclic_20e_nus.py ================================================ _base_ = ['./centerpoint_02pillar_second_secfpn_4x8_cyclic_20e_nus.py'] model = dict(test_cfg=dict(pts=dict(nms_type='circle'))) ================================================ FILE: configs/centerpoint/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus.py ================================================ _base_ = ['./centerpoint_02pillar_second_secfpn_4x8_cyclic_20e_nus.py'] model = dict( pts_bbox_head=dict( separate_head=dict( type='DCNSeparateHead', dcn_config=dict( type='DCN', in_channels=64, out_channels=64, kernel_size=3, padding=1, groups=4), init_bias=-2.19, final_kernel=3))) ================================================ FILE: configs/centerpoint/centerpoint_02pillar_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus.py ================================================ _base_ = ['./centerpoint_02pillar_second_secfpn_4x8_cyclic_20e_nus.py'] model = dict( pts_bbox_head=dict( separate_head=dict( type='DCNSeparateHead', dcn_config=dict( type='DCN', in_channels=64, out_channels=64, kernel_size=3, padding=1, groups=4), init_bias=-2.19, final_kernel=3)), test_cfg=dict(pts=dict(nms_type='circle'))) ================================================ FILE: configs/dynamic_voxelization/README.md ================================================ # Dynamic Voxelization ## Introduction [ALGORITHM] We implement Dynamic Voxelization, proposed in [End-to-End Multi-View Fusion for 3D Object Detection in LiDAR Point Clouds](https://arxiv.org/abs/1910.06528), and provide its results and models on the KITTI dataset.
``` @article{zhou2019endtoend, title={End-to-End Multi-View Fusion for 3D Object Detection in LiDAR Point Clouds}, author={Yin Zhou and Pei Sun and Yu Zhang and Dragomir Anguelov and Jiyang Gao and Tom Ouyang and James Guo and Jiquan Ngiam and Vijay Vasudevan}, year={2019}, eprint={1910.06528}, archivePrefix={arXiv}, primaryClass={cs.CV} } ``` ## Results ### KITTI | Model |Class| Lr schd | Mem (GB) | Inf time (fps) | mAP | Download | | :---------: | :-----: |:-----: | :------: | :------------: | :----: | :------: | |[SECOND](./dv_second_secfpn_6x8_80e_kitti-3d-car.py)|Car |cyclic 80e|5.5||78.83|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_second_secfpn_6x8_80e_kitti-3d-car/dv_second_secfpn_6x8_80e_kitti-3d-car_20200620_235228-ac2c1c0c.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_second_secfpn_6x8_80e_kitti-3d-car/dv_second_secfpn_6x8_80e_kitti-3d-car_20200620_235228.log.json)| |[SECOND](./dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class.py)| 3 Class|cosine 80e|5.5||65.10|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class_20200620_231010-6aa607d3.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class_20200620_231010.log.json)| |[PointPillars](./dv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py)| Car|cyclic 80e|4.7||77.76|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20200620_230844-ee7b75c9.pth) | 
[log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20200620_230844.log.json)| ================================================ FILE: configs/dynamic_voxelization/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py ================================================ _base_ = '../pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py' voxel_size = [0.16, 0.16, 4] point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1] model = dict( type='DynamicVoxelNet', voxel_layer=dict( max_num_points=-1, point_cloud_range=point_cloud_range, voxel_size=voxel_size, max_voxels=(-1, -1)), voxel_encoder=dict( type='DynamicPillarFeatureNet', in_channels=4, feat_channels=[64], with_distance=False, voxel_size=voxel_size, point_cloud_range=point_cloud_range)) ================================================ FILE: configs/dynamic_voxelization/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class.py ================================================ _base_ = '../second/hv_second_secfpn_6x8_80e_kitti-3d-3class.py' point_cloud_range = [0, -40, -3, 70.4, 40, 1] voxel_size = [0.05, 0.05, 0.1] model = dict( type='DynamicVoxelNet', voxel_layer=dict( _delete_=True, max_num_points=-1, point_cloud_range=point_cloud_range, voxel_size=voxel_size, max_voxels=(-1, -1)), voxel_encoder=dict( _delete_=True, type='DynamicSimpleVFE', voxel_size=voxel_size, point_cloud_range=point_cloud_range)) # optimizer lr = 0.003 # max learning rate optimizer = dict( _delete_=True, type='AdamW', lr=lr, betas=(0.95, 0.99), # the momentum is change during training weight_decay=0.001) lr_config = dict( _delete_=True, policy='CosineAnnealing', warmup='linear', warmup_iters=1000, warmup_ratio=1.0 / 10, min_lr_ratio=1e-5) momentum_config = None ================================================ FILE: configs/dynamic_voxelization/dv_second_secfpn_6x8_80e_kitti-3d-car.py ================================================ 
_base_ = '../second/hv_second_secfpn_6x8_80e_kitti-3d-car.py' point_cloud_range = [0, -40, -3, 70.4, 40, 1] voxel_size = [0.05, 0.05, 0.1] model = dict( type='DynamicVoxelNet', voxel_layer=dict( _delete_=True, max_num_points=-1, point_cloud_range=point_cloud_range, voxel_size=voxel_size, max_voxels=(-1, -1)), voxel_encoder=dict( _delete_=True, type='DynamicSimpleVFE', voxel_size=voxel_size, point_cloud_range=point_cloud_range)) ================================================ FILE: configs/fp16/README.md ================================================ # Mixed Precision Training ## Introduction [OTHERS] We implement mixed precision training and apply it to VoxelNets (e.g., SECOND and PointPillars). The results are in the following tables. **Note**: For mixed precision training, we currently do not support PointNet-based methods (e.g., VoteNet). Mixed precision training for PointNet-based methods will be supported in the future release. ## Results ### SECOND on KITTI dataset | Backbone |Class| Lr schd | FP32 Mem (GB) | FP16 Mem (GB) | FP32 mAP | FP16 mAP |Download | | :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | :------: | | [SECFPN](./hv_second_secfpn_fp16_6x8_80e_kitti-3d-car.py)| Car |cyclic 80e|5.4|2.9|79.07|78.72|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car_20200924_211301-1f5ad833.pth)| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car_20200924_211301.log.json)| | [SECFPN](./hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py)| 3 Class |cyclic 80e|5.4|2.9|64.41|67.4|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class_20200925_110059-05f67bdf.pth) | 
[log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class_20200925_110059.log.json)| ### PointPillars on nuScenes dataset | Backbone | Lr schd | FP32 Mem (GB) | FP16 Mem (GB) | FP32 mAP | FP32 NDS| FP16 mAP | FP16 NDS| Download | | :---------: | :-----: | :------: | :------------: | :----: |:----: | :----: |:----: | :------: | |[SECFPN](./hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d.py)|2x|16.4|8.37|35.17|49.7|35.19|50.27|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d_20201020_222626-c3f0483e.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d_20201020_222626.log.json)| |[FPN](./hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py)|2x|16.4|8.40|40.0|53.3|39.26|53.26|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d_20201021_120719-269f9dd6.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d_20201021_120719.log.json)| **Note**: 1. With mixed precision training, we can train PointPillars on the nuScenes dataset with 8 Titan XP GPUs and a batch size of 2. This would cause an OOM error without mixed precision training. 2. The loss scale for PointPillars on the nuScenes dataset is specifically tuned to keep the loss from becoming NaN. We find 32 is more stable than 512, though a loss scale of 32 still causes NaN sometimes.
================================================ FILE: configs/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py ================================================ _base_ = '../pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d.py' data = dict(samples_per_gpu=2, workers_per_gpu=2) # fp16 settings, the loss scale is specifically tuned to avoid Nan fp16 = dict(loss_scale=32.) ================================================ FILE: configs/fp16/hv_pointpillars_regnet-400mf_fpn_sbn-all_fp16_2x8_2x_nus-3d.py ================================================ _base_ = '../regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d.py' data = dict(samples_per_gpu=2, workers_per_gpu=2) # fp16 settings, the loss scale is specifically tuned to avoid Nan fp16 = dict(loss_scale=32.) ================================================ FILE: configs/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d.py ================================================ _base_ = '../pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py' data = dict(samples_per_gpu=2, workers_per_gpu=2) # fp16 settings, the loss scale is specifically tuned to avoid Nan fp16 = dict(loss_scale=32.) ================================================ FILE: configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py ================================================ _base_ = '../second/hv_second_secfpn_6x8_80e_kitti-3d-3class.py' # fp16 settings fp16 = dict(loss_scale=512.) ================================================ FILE: configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car.py ================================================ _base_ = '../second/hv_second_secfpn_6x8_80e_kitti-3d-car.py' # fp16 settings fp16 = dict(loss_scale=512.) 
================================================ FILE: configs/free_anchor/README.md ================================================ # FreeAnchor for 3D Object Detection ## Introduction [ALGORITHM] We implement FreeAnchor in 3D detection systems and provide its first results with PointPillars on the nuScenes dataset. With the implemented `FreeAnchor3DHead`, a PointPillars detector with a big backbone (e.g., RegNet-3.2GF) achieves top performance on the nuScenes benchmark. ``` @inproceedings{zhang2019freeanchor, title = {{FreeAnchor}: Learning to Match Anchors for Visual Object Detection}, author = {Zhang, Xiaosong and Wan, Fang and Liu, Chang and Ji, Rongrong and Ye, Qixiang}, booktitle = {Neural Information Processing Systems}, year = {2019} } ``` ## Usage ### Modify config As in the [baseline config](hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py), we only need to replace the head of an existing one-stage detector to use the FreeAnchor head. Since the config inherits from a common detector head, `_delete_=True` is necessary to avoid conflicts. The hyperparameters are specifically tuned according to the original paper.
```python _base_ = [ '../_base_/models/hv_pointpillars_fpn_nus.py', '../_base_/datasets/nus-3d.py', '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' ] model = dict( pts_bbox_head=dict( _delete_=True, type='FreeAnchor3DHead', num_classes=10, in_channels=256, feat_channels=256, use_direction_classifier=True, pre_anchor_topk=25, bbox_thr=0.5, gamma=2.0, alpha=0.5, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[[-50, -50, -1.8, 50, 50, -1.8]], scales=[1, 2, 4], sizes=[ [0.8660, 2.5981, 1.], # 1.5/sqrt(3) [0.5774, 1.7321, 1.], # 1/sqrt(3) [1., 1., 1.], [0.4, 0.4, 1], ], custom_values=[0, 0], rotations=[0, 1.57], reshape_out=True), assigner_per_size=False, diff_rad_by_sin=True, dir_offset=0.7854, # pi/4 dir_limit_offset=0, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.8), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.25, 0.25]))) ``` ## Results ### PointPillars | Backbone |FreeAnchor|Lr schd | Mem (GB) | Inf time (fps) | mAP |NDS| Download | | :---------: |:-----: |:-----: | :------: | :------------: | :----: |:----: | :------: | |[FPN](../pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d.py)|✗|2x|17.1||40.0|53.3|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405-2fa62f3d.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405.log.json)|
|[FPN](./hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py)|✓|2x|16.2||43.7|55.3|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20200628_210537-09d359fc.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20200628_210537.log.json)| |[RegNetX-400MF-FPN](../regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d.py)|✗|2x|17.3||44.8|56.4|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d_20200620_230239-c694dce7.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d_20200620_230239.log.json)| |[RegNetX-400MF-FPN](./hv_pointpillars_regnet-400mf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py)|✓|2x|17.7||47.9|58.6|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-400mf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20200629_050311-a334765d.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-400mf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20200629_050311.log.json)| 
|[RegNetX-1.6GF-FPN](./hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py)|✓|2x|24.3||51.2|60.8|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20200629_105446-6ffa59cb.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20200629_105446.log.json)| |[RegNetX-1.6GF-FPN](./hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d.py)*|✓|3x|24.3||53.0|62.2|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d_20200701_201531-036f7de3.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d_20200701_201531.log.json)| |[RegNetX-3.2GF-FPN](./hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py)|✓|2x|29.5||52.2|62.0|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20200629_055854-658125b0.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20200629_055854.log.json)| 
|[RegNetX-3.2GF-FPN](./hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d.py)*|✓|3x|29.5||55.09|63.5|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d_20200629_181452-297fdc66.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d_20200629_181452.log.json)| **Note**: Models marked with `*` are trained using stronger augmentation: vertical flip in bird's-eye view, global translation, and a larger range of global rotation. ================================================ FILE: configs/free_anchor/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py ================================================ _base_ = [ '../_base_/models/hv_pointpillars_fpn_nus.py', '../_base_/datasets/nus-3d.py', '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' ] model = dict( pts_bbox_head=dict( _delete_=True, type='FreeAnchor3DHead', num_classes=10, in_channels=256, feat_channels=256, use_direction_classifier=True, pre_anchor_topk=25, bbox_thr=0.5, gamma=2.0, alpha=0.5, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[[-50, -50, -1.8, 50, 50, -1.8]], scales=[1, 2, 4], sizes=[ [0.8660, 2.5981, 1.], # 1.5/sqrt(3) [0.5774, 1.7321, 1.], # 1/sqrt(3) [1., 1., 1.], [0.4, 0.4, 1], ], custom_values=[0, 0], rotations=[0, 1.57], reshape_out=True), assigner_per_size=False, diff_rad_by_sin=True, dir_offset=0.7854, # pi/4 dir_limit_offset=0, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.8),
loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.25, 0.25]))) ================================================ FILE: configs/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py ================================================ _base_ = './hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py' model = dict( pretrained=dict(pts='open-mmlab://regnetx_1.6gf'), pts_backbone=dict( _delete_=True, type='NoStemRegNet', arch='regnetx_1.6gf', out_indices=(1, 2, 3), frozen_stages=-1, strides=(1, 2, 2, 2), base_channels=64, stem_channels=64, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), norm_eval=False, style='pytorch'), pts_neck=dict(in_channels=[168, 408, 912])) ================================================ FILE: configs/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d.py ================================================ _base_ = './hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py' model = dict( pretrained=dict(pts='open-mmlab://regnetx_1.6gf'), pts_backbone=dict( _delete_=True, type='NoStemRegNet', arch='regnetx_1.6gf', out_indices=(1, 2, 3), frozen_stages=-1, strides=(1, 2, 2, 2), base_channels=64, stem_channels=64, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), norm_eval=False, style='pytorch'), pts_neck=dict(in_channels=[168, 408, 912])) # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-50, -50, -5, 50, 50, 3] # For nuScenes we usually do 10-class detection class_names = [ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' ] # file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. 
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. file_client_args = dict( backend='petrel', path_mapping=dict({ './data/nuscenes/': 's3://nuscenes/nuscenes/', 'data/nuscenes/': 's3://nuscenes/nuscenes/' })) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='GlobalRotScaleTrans', rot_range=[-0.7854, 0.7854], scale_ratio_range=[0.95, 1.05], translation_std=[0.2, 0.2, 0.2]), dict( type='RandomFlip3D', flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] data = dict(train=dict(pipeline=train_pipeline)) lr_config = dict(step=[28, 34]) evaluation = dict(interval=36) total_epochs = 36 ================================================ FILE: configs/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py ================================================ _base_ = './hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py' model = dict( pretrained=dict(pts='open-mmlab://regnetx_3.2gf'), pts_backbone=dict( _delete_=True, type='NoStemRegNet', arch='regnetx_3.2gf', out_indices=(1, 2, 3), frozen_stages=-1, strides=(1, 2, 2, 2), base_channels=64, stem_channels=64, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), norm_eval=False, style='pytorch'), pts_neck=dict(in_channels=[192, 432, 1008])) ================================================ FILE: 
configs/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d.py ================================================ _base_ = './hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py' model = dict( pretrained=dict(pts='open-mmlab://regnetx_3.2gf'), pts_backbone=dict( _delete_=True, type='NoStemRegNet', arch='regnetx_3.2gf', out_indices=(1, 2, 3), frozen_stages=-1, strides=(1, 2, 2, 2), base_channels=64, stem_channels=64, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), norm_eval=False, style='pytorch'), pts_neck=dict(in_channels=[192, 432, 1008])) # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-50, -50, -5, 50, 50, 3] # For nuScenes we usually do 10-class detection class_names = [ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' ] # file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. 
file_client_args = dict( backend='petrel', path_mapping=dict({ './data/nuscenes/': 's3://nuscenes/nuscenes/', 'data/nuscenes/': 's3://nuscenes/nuscenes/' })) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='GlobalRotScaleTrans', rot_range=[-0.7854, 0.7854], scale_ratio_range=[0.9, 1.1], translation_std=[0.2, 0.2, 0.2]), dict( type='RandomFlip3D', flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] data = dict(train=dict(pipeline=train_pipeline)) lr_config = dict(step=[28, 34]) evaluation = dict(interval=36) total_epochs = 36 ================================================ FILE: configs/free_anchor/hv_pointpillars_regnet-400mf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py ================================================ _base_ = './hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py' model = dict( pretrained=dict(pts='open-mmlab://regnetx_400mf'), pts_backbone=dict( _delete_=True, type='NoStemRegNet', arch='regnetx_400mf', out_indices=(1, 2, 3), frozen_stages=-1, strides=(1, 2, 2, 2), base_channels=64, stem_channels=64, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), norm_eval=False, style='pytorch'), pts_neck=dict(in_channels=[64, 160, 384])) ================================================ FILE: configs/h3dnet/README.md ================================================ # H3DNet: 3D Object Detection Using Hybrid Geometric Primitives ## 
Introduction [ALGORITHM] We implement H3DNet and provide the result and checkpoints on ScanNet datasets. ``` @inproceedings{zhang2020h3dnet, author = {Zhang, Zaiwei and Sun, Bo and Yang, Haitao and Huang, Qixing}, title = {H3DNet: 3D Object Detection Using Hybrid Geometric Primitives}, booktitle = {Proceedings of the European Conference on Computer Vision}, year = {2020} } ``` ## Results ### ScanNet | Backbone | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 |AP@0.5| Download | | :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | | [MultiBackbone](./h3dnet_3x8_scannet-3d-18class.py) | 3x |7.9||66.43|48.01|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/h3dnet/h3dnet_scannet-3d-18class/h3dnet_scannet-3d-18class_20200830_000136-02e36246.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/h3dnet/h3dnet_scannet-3d-18class/h3dnet_scannet-3d-18class_20200830_000136.log.json) | ================================================ FILE: configs/h3dnet/h3dnet_3x8_scannet-3d-18class.py ================================================ _base_ = [ '../_base_/datasets/scannet-3d-18class.py', '../_base_/models/h3dnet.py', '../_base_/schedules/schedule_3x.py', '../_base_/default_runtime.py' ] # model settings model = dict( rpn_head=dict( num_classes=18, bbox_coder=dict( type='PartialBinBasedBBoxCoder', num_sizes=18, num_dir_bins=24, with_rot=False, mean_sizes=[[0.76966727, 0.8116021, 0.92573744], [1.876858, 1.8425595, 1.1931566], [0.61328, 0.6148609, 0.7182701], [1.3955007, 1.5121545, 0.83443564], [0.97949594, 1.0675149, 0.6329687], [0.531663, 0.5955577, 1.7500148], [0.9624706, 0.72462326, 1.1481868], [0.83221924, 1.0490936, 1.6875663], [0.21132214, 0.4206159, 0.5372846], [1.4440073, 1.8970833, 0.26985747], [1.0294262, 1.4040797, 0.87554324], [1.3766412, 0.65521795, 1.6813129], [0.6650819, 0.71111923, 1.298853], [0.41999173, 0.37906948, 1.7513971], [0.59359556, 0.5912492, 0.73919016], [0.50867593, 0.50656086, 
0.30136237], [1.1511526, 1.0546296, 0.49706793], [0.47535285, 0.49249494, 0.5802117]])), roi_head=dict( bbox_head=dict( num_classes=18, bbox_coder=dict( type='PartialBinBasedBBoxCoder', num_sizes=18, num_dir_bins=24, with_rot=False, mean_sizes=[[0.76966727, 0.8116021, 0.92573744], [1.876858, 1.8425595, 1.1931566], [0.61328, 0.6148609, 0.7182701], [1.3955007, 1.5121545, 0.83443564], [0.97949594, 1.0675149, 0.6329687], [0.531663, 0.5955577, 1.7500148], [0.9624706, 0.72462326, 1.1481868], [0.83221924, 1.0490936, 1.6875663], [0.21132214, 0.4206159, 0.5372846], [1.4440073, 1.8970833, 0.26985747], [1.0294262, 1.4040797, 0.87554324], [1.3766412, 0.65521795, 1.6813129], [0.6650819, 0.71111923, 1.298853], [0.41999173, 0.37906948, 1.7513971], [0.59359556, 0.5912492, 0.73919016], [0.50867593, 0.50656086, 0.30136237], [1.1511526, 1.0546296, 0.49706793], [0.47535285, 0.49249494, 0.5802117]])))) data = dict(samples_per_gpu=3, workers_per_gpu=2) # optimizer # yapf:disable log_config = dict( interval=30, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) # yapf:enable ================================================ FILE: configs/imvotenet/README.md ================================================ # ImVoteNet: Boosting 3D Object Detection in Point Clouds with Image Votes ## Introduction [ALGORITHM] We implement ImVoteNet and provide the result and checkpoints on SUNRGBD. 
``` @inproceedings{qi2020imvotenet, title={Imvotenet: Boosting 3D object detection in point clouds with image votes}, author={Qi, Charles R and Chen, Xinlei and Litany, Or and Guibas, Leonidas J}, booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition}, pages={4404--4413}, year={2020} } ``` ## Results ### SUNRGBD-2D (Stage 1, image branch pre-train) | Backbone | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 |AP@0.5| Download | | :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | | [PointNet++](./imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class.py) | |2.1| ||62.70|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210323_173222-cad62aeb.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210323_173222.log.json)| ### SUNRGBD-3D (Stage 2) | Backbone | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 |AP@0.5| Download | | :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | | [PointNet++](./imvotenet_stage2_16x8_sunrgbd-3d-10class.py) | 3x |9.4| |64.04||[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class/imvotenet_stage2_16x8_sunrgbd-3d-10class_20210323_184021-d44dcb66.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class/imvotenet_stage2_16x8_sunrgbd-3d-10class_20210323_184021.log.json)| ================================================ FILE: configs/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class.py ================================================ _base_ = [ '../_base_/datasets/sunrgbd-3d-10class.py', '../_base_/default_runtime.py', 
'../_base_/models/imvotenet_image.py' ] # use caffe img_norm img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True), dict( type='Resize', img_scale=[(1333, 480), (1333, 504), (1333, 528), (1333, 552), (1333, 576), (1333, 600)], multiscale_mode='value', keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1333, 600), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict(times=1, dataset=dict(pipeline=train_pipeline)), val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline)) optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=None) lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=0.001, step=[6]) total_epochs = 8 load_from = 'http://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth' # noqa ================================================ FILE: configs/imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class.py ================================================ _base_ = [ '../_base_/datasets/sunrgbd-3d-10class.py', '../_base_/schedules/schedule_3x.py', '../_base_/default_runtime.py', '../_base_/models/imvotenet_image.py' ] class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 
'dresser', 'night_stand', 'bookshelf', 'bathtub') # use caffe img_norm img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) model = dict( pts_backbone=dict( type='PointNet2SASSG', in_channels=4, num_points=(2048, 1024, 512, 256), radius=(0.2, 0.4, 0.8, 1.2), num_samples=(64, 32, 16, 16), sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), (128, 128, 256)), fp_channels=((256, 256), (256, 256)), norm_cfg=dict(type='BN2d'), sa_cfg=dict( type='PointSAModule', pool_mod='max', use_xyz=True, normalize_xyz=True)), pts_bbox_heads=dict( common=dict( type='VoteHead', num_classes=10, bbox_coder=dict( type='PartialBinBasedBBoxCoder', num_sizes=10, num_dir_bins=12, with_rot=True, mean_sizes=[[2.114256, 1.620300, 0.927272], [0.791118, 1.279516, 0.718182], [0.923508, 1.867419, 0.845495], [0.591958, 0.552978, 0.827272], [0.699104, 0.454178, 0.75625], [0.69519, 1.346299, 0.736364], [0.528526, 1.002642, 1.172878], [0.500618, 0.632163, 0.683424], [0.404671, 1.071108, 1.688889], [0.76584, 1.398258, 0.472728]]), pred_layer_cfg=dict( in_channels=128, shared_conv_channels=(128, 128), bias=True), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.2, 0.8], reduction='sum', loss_weight=5.0), center_loss=dict( type='ChamferDistance', mode='l2', reduction='sum', loss_src_weight=10.0, loss_dst_weight=10.0), dir_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), dir_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), size_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), size_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0), semantic_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), joint=dict( vote_module_cfg=dict( in_channels=512, vote_per_seed=1, gt_per_seed=3, conv_channels=(512, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), 
norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=256, radius=0.3, num_sample=16, mlp_channels=[512, 128, 128, 128], use_xyz=True, normalize_xyz=True)), pts=dict( vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=3, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=256, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True)), img=dict( vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=3, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=256, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True)), loss_weights=[0.4, 0.3, 0.3]), img_mlp=dict( in_channel=18, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), act_cfg=dict(type='ReLU')), fusion_layer=dict( type='VoteFusion', num_classes=len(class_names), max_imvote_per_pixel=3), num_sampled_seed=1024, freeze_img_branch=True, # model training and testing settings train_cfg=dict( pts=dict( pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote')), test_cfg=dict( img_rcnn=dict(score_thr=0.1), pts=dict( sample_mod='seed', nms_thr=0.25, score_thr=0.05, per_class_proposal=True))) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=True, load_dim=6, use_dim=[0, 1, 2]), dict(type='LoadImageFromFile'), dict(type='LoadAnnotations3D'), dict(type='LoadAnnotations', with_bbox=True), dict(type='Resize', img_scale=(1333, 600), 
keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.0), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, ), dict( type='GlobalRotScaleTrans', rot_range=[-0.523599, 0.523599], scale_ratio_range=[0.85, 1.15], shift_height=True), dict(type='IndoorPointSample', num_points=20000), dict(type='DefaultFormatBundle3D', class_names=class_names), dict( type='Collect3D', keys=[ 'img', 'gt_bboxes', 'gt_labels', 'points', 'gt_bboxes_3d', 'gt_labels_3d', 'calib' ]) ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=True, load_dim=6, use_dim=[0, 1, 2]), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 600), pts_scale_ratio=1, flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.0), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, ), dict(type='IndoorPointSample', num_points=20000), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['img', 'points', 'calib']) ]), ] data = dict( train=dict(dataset=dict(pipeline=train_pipeline)), val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline)) # may also use your own pre-trained image branch load_from = 'https://download.openmmlab.com/mmdetection3d/v0.1.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210323_173222-cad62aeb.pth' # noqa ================================================ FILE: configs/mvxnet/README.md ================================================ # MVX-Net: Multimodal VoxelNet for 3D Object Detection ## Introduction [ALGORITHM] We implement MVX-Net and provide its 
results and models on KITTI dataset. ``` @inproceedings{sindagi2019mvx, title={MVX-Net: Multimodal voxelnet for 3D object detection}, author={Sindagi, Vishwanath A and Zhou, Yin and Tuzel, Oncel}, booktitle={2019 International Conference on Robotics and Automation (ICRA)}, pages={7276--7282}, year={2019}, organization={IEEE} } ``` ## Results ### KITTI | Backbone |Class| Lr schd | Mem (GB) | Inf time (fps) | mAP | Download | | :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | | [SECFPN](./dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class.py)|3 Class|cosine 80e|6.7||63.0|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class_20200621_003904-10140f2d.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class_20200621_003904.log.json)| ================================================ FILE: configs/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class.py ================================================ # model settings voxel_size = [0.05, 0.05, 0.1] point_cloud_range = [0, -40, -3, 70.4, 40, 1] model = dict( type='DynamicMVXFasterRCNN', img_backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=False), norm_eval=True, style='caffe'), img_neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), pts_voxel_layer=dict( max_num_points=-1, point_cloud_range=point_cloud_range, voxel_size=voxel_size, max_voxels=(-1, -1), ), pts_voxel_encoder=dict( type='DynamicVFE', in_channels=4, feat_channels=[64, 64], with_distance=False, voxel_size=voxel_size, with_cluster_center=True, with_voxel_center=True, point_cloud_range=point_cloud_range, fusion_layer=dict( 
type='PointFusion', img_channels=256, pts_channels=64, mid_channels=128, out_channels=128, img_levels=[0, 1, 2, 3, 4], align_corners=False, activate_out=True, fuse_out=False)), pts_middle_encoder=dict( type='SparseEncoder', in_channels=128, sparse_shape=[41, 1600, 1408], order=('conv', 'norm', 'act')), pts_backbone=dict( type='SECOND', in_channels=256, layer_nums=[5, 5], layer_strides=[1, 2], out_channels=[128, 256]), pts_neck=dict( type='SECONDFPN', in_channels=[128, 256], upsample_strides=[1, 2], out_channels=[256, 256]), pts_bbox_head=dict( type='Anchor3DHead', num_classes=3, in_channels=512, feat_channels=512, use_direction_classifier=True, anchor_generator=dict( type='Anchor3DRangeGenerator', ranges=[ [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -1.78, 70.4, 40.0, -1.78], ], sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], rotations=[0, 1.57], reshape_out=False), assigner_per_size=True, diff_rad_by_sin=True, assign_per_class=True, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( pts=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.35, neg_iou_thr=0.2, min_pos_iou=0.2, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.35, neg_iou_thr=0.2, min_pos_iou=0.2, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1), ], allowed_border=0, pos_weight=-1, debug=False)), test_cfg=dict( pts=dict( use_rotate_nms=True, nms_across_levels=False, nms_thr=0.01, 
score_thr=0.1, min_bbox_size=0, nms_pre=100, max_num=50))) # dataset settings dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Pedestrian', 'Cyclist', 'Car'] img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) input_modality = dict(use_lidar=True, use_camera=True) train_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4), dict(type='LoadImageFromFile'), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='Resize', img_scale=[(640, 192), (2560, 768)], multiscale_mode='range', keep_ratio=True), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05], translation_std=[0.2, 0.2, 0.2]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle3D', class_names=class_names), dict( type='Collect3D', keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d']), ] test_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4), dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug3D', img_scale=(1280, 384), pts_scale_ratio=1, flip=False, transforms=[ dict(type='Resize', multiscale_mode='value', keep_ratio=True), dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points', 'img']) ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type='RepeatDataset', times=2, 
dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_train.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False)), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True)) # Training settings optimizer = dict(type='AdamW', lr=0.003, betas=(0.95, 0.99), weight_decay=0.01) # max_norm=10 is better for SECOND optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=1000, warmup_ratio=1.0 / 10, min_lr_ratio=1e-5) momentum_config = None checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) # yapf:enable evaluation = dict(interval=1) # runtime settings total_epochs = 40 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = None # You may need to download the model first if the network is unstable load_from = 'https://download.openmmlab.com/mmdetection3d/pretrain_models/mvx_faster_rcnn_detectron2-caffe_20e_coco-pretrain_gt-sample_kitti-3-class_moderate-79.3_20200207-a4a6a3c7.pth' # noqa resume_from = None workflow = [('train', 1)] ================================================ FILE: configs/nuimages/README.md ================================================ # NuImages Results ## Introduction [DATASET] We support and provide some baseline results on [nuImages dataset](https://www.nuscenes.org/nuimages).
We follow the class mapping in the nuScenes dataset, which maps the original categories into 10 foreground categories. The conversion script can be found [here](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/data_converter/nuimage_converter.py). The baseline results include instance segmentation models, e.g., Mask R-CNN, Cascade Mask R-CNN, and HTC. We will support panoptic segmentation models in the future. ![demo image](../../resources/nuimages_demo.gif) The dataset converted by the script of v0.6.0 only supports instance segmentation. Since v0.7.0, we also support producing a semantic segmentation mask for each image; thus, we can train HTC or semantic segmentation models using the dataset. To convert the nuImages dataset into COCO format, please use the command below: ```shell python -u tools/data_converter/nuimage_converter.py --data-root ${DATA_ROOT} --version ${VERSIONS} \ --out-dir ${OUT_DIR} --nproc ${NUM_WORKERS} --extra-tag ${TAG} ``` - `--data-root`: the root of the dataset, defaults to `./data/nuimages`. - `--version`: the version of the dataset, defaults to `v1.0-mini`. To get the full dataset, please use `--version v1.0-train v1.0-val v1.0-mini` - `--out-dir`: the output directory of annotations and semantic masks, defaults to `./data/nuimages/annotations/`. - `--nproc`: number of workers for data preparation, defaults to `4`. A larger number can reduce the preparation time as images are processed in parallel. - `--extra-tag`: extra tag of the annotations, defaults to `nuimages`. This can be used to separate different annotations processed at different times for study. ## Results ### Instance Segmentation We report Mask R-CNN and Cascade Mask R-CNN results on nuImages.
|Method | Backbone|Pretraining | Lr schd | Mem (GB) | Box AP | Mask AP |Download | | :---------: |:---------: | :---------: | :-----: |:-----: | :------: | :------------: | :----: | | Mask R-CNN| [R-50](./mask_rcnn_r50_fpn_1x_nuim.py) |IN|1x|7.4|47.8 |38.4|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_fpn_1x_nuim/mask_rcnn_r50_fpn_1x_nuim_20201008_195238-e99f5182.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_fpn_1x_nuim/mask_rcnn_r50_fpn_1x_nuim_20201008_195238.log.json)| | Mask R-CNN| [R-50](./mask_rcnn_r50_fpn_coco-2x_1x_nuim.py) |IN+COCO-2x|1x|7.4|49.7|40.5|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_fpn_coco-2x_1x_nuim/mask_rcnn_r50_fpn_coco-2x_1x_nuim_20201008_195238-b1742a60.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_fpn_coco-2x_1x_nuim/mask_rcnn_r50_fpn_coco-2x_1x_nuim_20201008_195238.log.json)| | Mask R-CNN| [R-50-CAFFE](./mask_rcnn_r50_caffe_fpn_1x_nuim.py) |IN|1x|7.0|47.7|38.2|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_1x_nuim/) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_1x_nuim/)| | Mask R-CNN| [R-50-CAFFE](./mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim.py) |IN+COCO-3x|1x|7.0|49.9|40.8|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim/mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim_20201008_195305-661a992e.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim/mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim_20201008_195305.log.json)| | Mask R-CNN| [R-50-CAFFE](./mask_rcnn_r50_caffe_fpn_coco-3x_20e_nuim.py)
|IN+COCO-3x|20e|7.0|50.6|41.3|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_coco-3x_20e_nuim/mask_rcnn_r50_caffe_fpn_coco-3x_20e_nuim_20201009_125002-5529442c.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_coco-3x_20e_nuim/mask_rcnn_r50_caffe_fpn_coco-3x_20e_nuim_20201009_125002.log.json)| | Mask R-CNN| [R-101](./mask_rcnn_r101_fpn_1x_nuim.py) |IN|1x|10.9|48.9|39.1|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r101_fpn_1x_nuim/mask_rcnn_r101_fpn_1x_nuim_20201024_134803-65c7623a.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r101_fpn_1x_nuim/mask_rcnn_r101_fpn_1x_nuim_20201024_134803.log.json)| | Mask R-CNN| [X-101_32x4d](./mask_rcnn_x101_32x4d_fpn_1x_nuim.py) |IN|1x|13.3|50.4|40.5|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_x101_32x4d_fpn_1x_nuim/mask_rcnn_x101_32x4d_fpn_1x_nuim_20201024_135741-b699ab37.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_x101_32x4d_fpn_1x_nuim/mask_rcnn_x101_32x4d_fpn_1x_nuim_20201024_135741.log.json)| | Cascade Mask R-CNN| [R-50](./cascade_mask_rcnn_r50_fpn_1x_nuim.py) |IN|1x|8.9|50.8|40.4|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_1x_nuim/cascade_mask_rcnn_r50_fpn_1x_nuim_20201008_195342-1147c036.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_1x_nuim/cascade_mask_rcnn_r50_fpn_1x_nuim_20201008_195342.log.json)| | Cascade Mask R-CNN| [R-50](./cascade_mask_rcnn_r50_fpn_coco-20e_1x_nuim.py) 
|IN+COCO-20e|1x|8.9|52.8|42.2|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_coco-20e_1x_nuim/cascade_mask_rcnn_r50_fpn_coco-20e_1x_nuim_20201009_124158-ad0540e3.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_coco-20e_1x_nuim/cascade_mask_rcnn_r50_fpn_coco-20e_1x_nuim_20201009_124158.log.json)| | Cascade Mask R-CNN| [R-50](./cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim.py) |IN+COCO-20e|20e|8.9|52.8|42.2|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951.log.json)| | Cascade Mask R-CNN| [R-101](./cascade_mask_rcnn_r101_fpn_1x_nuim.py) |IN|1x|12.5|51.5|40.7|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r101_fpn_1x_nuim/cascade_mask_rcnn_r101_fpn_1x_nuim_20201024_134804-45215b1e.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r101_fpn_1x_nuim/cascade_mask_rcnn_r101_fpn_1x_nuim_20201024_134804.log.json)| | Cascade Mask R-CNN| [X-101_32x4d](./cascade_mask_rcnn_x101_32x4d_fpn_1x_nuim.py) |IN|1x|14.9|52.8|41.6|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_x101_32x4d_fpn_1x_nuim/cascade_mask_rcnn_x101_32x4d_fpn_1x_nuim_20201024_135753-e0e49778.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_x101_32x4d_fpn_1x_nuim/cascade_mask_rcnn_x101_32x4d_fpn_1x_nuim_20201024_135753.log.json)| | HTC w/o semantic|[R-50](./htc_without_semantic_r50_fpn_1x_nuim.py) |IN|1x||[model]() | [log]()| | 
HTC|[R-50](./htc_r50_fpn_1x_nuim.py) |IN|1x||[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/)| | HTC|[R-50](./htc_r50_fpn_coco-20e_1x_nuim.py) |IN+COCO-20e|1x|11.6|53.8|43.8|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_r50_fpn_coco-20e_1x_nuim/htc_r50_fpn_coco-20e_1x_nuim_20201010_070203-0b53a65e.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_r50_fpn_coco-20e_1x_nuim/htc_r50_fpn_coco-20e_1x_nuim_20201010_070203.log.json)| | HTC|[R-50](./htc_r50_fpn_coco-20e_20e_nuim.py) |IN+COCO-20e|20e|11.6|54.8|44.4|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_r50_fpn_coco-20e_20e_nuim/htc_r50_fpn_coco-20e_20e_nuim_20201008_211415-d6c60a2c.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_r50_fpn_coco-20e_20e_nuim/htc_r50_fpn_coco-20e_20e_nuim_20201008_211415.log.json)| | HTC|[X-101_64x4d + DCN_c3-c5](./htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim.py) |IN+COCO-20e|20e|13.3|57.3|46.4|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim/htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim_20201008_211222-0b16ac4b.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim/htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim_20201008_211222.log.json)| **Note**: 1. `IN` means only using ImageNet pre-trained backbone. `IN+COCO-Nx` and `IN+COCO-Ne` means the backbone is first pre-trained on ImageNet, and then the detector is pre-trained on COCO train2017 dataset by `Nx` and `N` epochs schedules, respectively. 2. 
All the training hyper-parameters follow the standard schedules on the COCO dataset except that the images are resized from 1280 x 720 to 1920 x 1080 (relative ratio 0.8 to 1.2) since the images are of size 1600 x 900. 3. The class order in the detectors released in v0.6.0 is different from the order in the configs because of a bug in the conversion script. This bug has been fixed since v0.7.0 and the models trained with the correct class order are also released. If you have used nuImages since v0.6.0, please re-convert the data with the conversion script using the above-mentioned command. ================================================ FILE: configs/nuimages/cascade_mask_rcnn_r101_fpn_1x_nuim.py ================================================ _base_ = './cascade_mask_rcnn_r50_fpn_1x_nuim.py' model = dict(pretrained='torchvision://resnet101', backbone=dict(depth=101)) ================================================ FILE: configs/nuimages/cascade_mask_rcnn_r50_fpn_1x_nuim.py ================================================ _base_ = [ '../_base_/models/cascade_mask_rcnn_r50_fpn.py', '../_base_/datasets/nuim_instance.py', '../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py' ] model = dict( roi_head=dict( bbox_head=[ dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=10, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=10, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
loss_weight=1.0)), dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=10, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) ], mask_head=dict(num_classes=10))) ================================================ FILE: configs/nuimages/cascade_mask_rcnn_r50_fpn_coco-20e_1x_nuim.py ================================================ _base_ = './cascade_mask_rcnn_r50_fpn_1x_nuim.py' load_from = 'http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco/cascade_mask_rcnn_r50_fpn_20e_coco_bbox_mAP-0.419__segm_mAP-0.365_20200504_174711-4af8e66e.pth' # noqa ================================================ FILE: configs/nuimages/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim.py ================================================ _base_ = './cascade_mask_rcnn_r50_fpn_1x_nuim.py' # learning policy lr_config = dict(step=[16, 19]) total_epochs = 20 load_from = 'http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco/cascade_mask_rcnn_r50_fpn_20e_coco_bbox_mAP-0.419__segm_mAP-0.365_20200504_174711-4af8e66e.pth' # noqa ================================================ FILE: configs/nuimages/cascade_mask_rcnn_x101_32x4d_fpn_1x_nuim.py ================================================ _base_ = './cascade_mask_rcnn_r50_fpn_1x_nuim.py' model = dict( pretrained='open-mmlab://resnext101_32x4d', backbone=dict( type='ResNeXt', depth=101, groups=32, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), style='pytorch')) ================================================ FILE: configs/nuimages/htc_r50_fpn_1x_nuim.py ================================================ _base_ = 
'./htc_without_semantic_r50_fpn_1x_nuim.py' model = dict( roi_head=dict( semantic_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, featmap_strides=[8]), semantic_head=dict( type='FusedSemanticHead', num_ins=5, fusion_level=1, num_convs=4, in_channels=256, conv_out_channels=256, num_classes=32, ignore_label=0, loss_weight=0.2))) data_root = 'data/nuimages/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFile'), dict( type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True), dict( type='Resize', img_scale=[(1280, 720), (1920, 1080)], multiscale_mode='range', keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='SegRescale', scale_factor=1 / 8), dict(type='DefaultFormatBundle'), dict( type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']) ] data = dict( train=dict( seg_prefix=data_root + 'annotations/semantic_masks/', pipeline=train_pipeline)) ================================================ FILE: configs/nuimages/htc_r50_fpn_coco-20e_1x_nuim.py ================================================ _base_ = './htc_r50_fpn_1x_nuim.py' load_from = 'http://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_20e_coco/htc_r50_fpn_20e_coco_20200319-fe28c577.pth' # noqa ================================================ FILE: configs/nuimages/htc_r50_fpn_coco-20e_20e_nuim.py ================================================ _base_ = './htc_r50_fpn_coco-20e_1x_nuim.py' # learning policy lr_config = dict(step=[16, 19]) total_epochs = 20 ================================================ FILE: configs/nuimages/htc_without_semantic_r50_fpn_1x_nuim.py ================================================ _base_ = [ '../_base_/datasets/nuim_instance.py', 
'../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py' ] # model settings model = dict( type='HybridTaskCascade', pretrained='torchvision://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), roi_head=dict( type='HybridTaskCascadeRoIHead', interleaved=True, mask_info_flow=True, num_stages=3, stage_loss_weights=[1, 0.5, 0.25], bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=10, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=10, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( type='Shared2FCBBoxHead', 
in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=10, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) ], mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=[ dict( type='HTCMaskHead', with_conv_res=False, num_convs=4, in_channels=256, conv_out_channels=256, num_classes=10, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)), dict( type='HTCMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=10, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)), dict( type='HTCMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=10, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)) ]), # model training and testing settings train_cfg=dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, 
add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False) ]), test_cfg=dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.001, nms=dict(type='nms', iou_threshold=0.5), max_per_img=100, mask_thr_binary=0.5))) ================================================ FILE: configs/nuimages/htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim.py ================================================ _base_ = './htc_r50_fpn_1x_nuim.py' model = dict( pretrained='open-mmlab://resnext101_64x4d', backbone=dict( type='ResNeXt', depth=101, groups=64, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch', dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True))) data = dict(samples_per_gpu=1, workers_per_gpu=1) # learning policy lr_config = dict(step=[16, 19]) total_epochs = 20 load_from = 'http://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco_20200312-946fd751.pth' # noqa ================================================ FILE: configs/nuimages/mask_rcnn_r101_fpn_1x_nuim.py ================================================ _base_ = './mask_rcnn_r50_fpn_1x_nuim.py' model = dict(pretrained='torchvision://resnet101', backbone=dict(depth=101)) ================================================ FILE: configs/nuimages/mask_rcnn_r50_caffe_fpn_1x_nuim.py ================================================ _base_ = [ '../_base_/models/mask_rcnn_r50_fpn.py', 
'../_base_/datasets/nuim_instance.py', '../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py' ] model = dict( pretrained='open-mmlab://detectron2/resnet50_caffe', backbone=dict(norm_cfg=dict(requires_grad=False), style='caffe'), roi_head=dict( bbox_head=dict(num_classes=10), mask_head=dict(num_classes=10))) # use caffe img_norm img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True, with_mask=True), dict( type='Resize', img_scale=[(1280, 720), (1920, 1080)], multiscale_mode='range', keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1600, 900), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( train=dict(pipeline=train_pipeline), val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline)) ================================================ FILE: configs/nuimages/mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim.py ================================================ _base_ = [ '../_base_/models/mask_rcnn_r50_fpn.py', '../_base_/datasets/nuim_instance.py', '../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py' ] model = dict( pretrained='open-mmlab://detectron2/resnet50_caffe', backbone=dict(norm_cfg=dict(requires_grad=False), style='caffe'), roi_head=dict( bbox_head=dict(num_classes=10), mask_head=dict(num_classes=10))) # use caffe img_norm img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], 
to_rgb=False) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True, with_mask=True), dict( type='Resize', img_scale=[(1280, 720), (1920, 1080)], multiscale_mode='range', keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1600, 900), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( train=dict(pipeline=train_pipeline), val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline)) load_from = 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth' # noqa ================================================ FILE: configs/nuimages/mask_rcnn_r50_caffe_fpn_coco-3x_20e_nuim.py ================================================ _base_ = [ '../_base_/models/mask_rcnn_r50_fpn.py', '../_base_/datasets/nuim_instance.py', '../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py' ] model = dict( pretrained='open-mmlab://detectron2/resnet50_caffe', backbone=dict(norm_cfg=dict(requires_grad=False), style='caffe'), roi_head=dict( bbox_head=dict(num_classes=10), mask_head=dict(num_classes=10))) # use caffe img_norm img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True, with_mask=True), dict( type='Resize', img_scale=[(1280, 720), (1920, 1080)], 
multiscale_mode='range', keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1600, 900), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( train=dict(pipeline=train_pipeline), val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline)) # learning policy lr_config = dict(step=[16, 19]) total_epochs = 20 load_from = 'http://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth' # noqa ================================================ FILE: configs/nuimages/mask_rcnn_r50_fpn_1x_nuim.py ================================================ _base_ = [ '../_base_/models/mask_rcnn_r50_fpn.py', '../_base_/datasets/nuim_instance.py', '../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py' ] model = dict( roi_head=dict( bbox_head=dict(num_classes=10), mask_head=dict(num_classes=10))) ================================================ FILE: configs/nuimages/mask_rcnn_r50_fpn_coco-2x_1x_nuim.py ================================================ _base_ = [ '../_base_/models/mask_rcnn_r50_fpn.py', '../_base_/datasets/nuim_instance.py', '../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py' ] model = dict( roi_head=dict( bbox_head=dict(num_classes=10), mask_head=dict(num_classes=10))) load_from = 
'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_2x_coco/mask_rcnn_r50_fpn_2x_coco_bbox_mAP-0.392__segm_mAP-0.354_20200505_003907-3e542a40.pth' # noqa ================================================ FILE: configs/nuimages/mask_rcnn_r50_fpn_coco-2x_1x_nus-2d.py ================================================ _base_ = [ '../_base_/models/mask_rcnn_r50_fpn.py', '../_base_/datasets/nuim_instance.py', '../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py' ] model = dict( roi_head=dict( bbox_head=dict(num_classes=10), mask_head=dict(num_classes=10))) file_client_args = dict( backend='petrel', path_mapping=dict({ './data/nuscenes/': 's3://nuscenes/nuscenes/', 'data/nuscenes/': 's3://nuscenes/nuscenes/' })) img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) test_pipeline = [ dict(type='LoadImageFromFile', file_client_args=file_client_args), dict( type='MultiScaleFlipAug', img_scale=(1600, 900), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data_root = 'data/nuimages/' # data = dict( # val=dict( # ann_file=data_root + 'annotations/nuimages_v1.0-mini.json'), # test=dict( # ann_file=data_root + 'annotations/nuimages_v1.0-mini.json')) ================================================ FILE: configs/nuimages/mask_rcnn_swinT_coco-2x_1x_nuim.py ================================================ _base_ = [ '../_base_/datasets/nuim_instance.py', '../_base_/default_runtime.py' ] model = dict( type='MaskRCNN', backbone=dict( type='SwinTransformer', embed_dims=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7, mlp_ratio=4, qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.2, patch_norm=True, out_indices=(0, 1, 2, 3), with_cp=False, convert_weights=True, ), 
neck=dict( type='FPN', in_channels=[96, 192, 384, 768], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), roi_head=dict( type='StandardRoIHead', bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=10, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=False, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=10, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), # model training and testing settings train_cfg=dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=-1, pos_weight=-1, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, 
min_pos_iou=0.5, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False)), test_cfg=dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_threshold=0.5), max_per_img=100, mask_thr_binary=0.5))) load_from = '/data/yc_code/ImplicitFusion/checkpoints/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco_20210906_131725-bacf6f7b.pth' # noqa data = dict( samples_per_gpu=2, workers_per_gpu=4 ) # optimizer optimizer = dict( type='AdamW', lr=0.000025, betas=(0.9, 0.999), weight_decay=0.05, paramwise_cfg=dict( custom_keys={ 'absolute_pos_embed': dict(decay_mult=0.), 'relative_position_bias_table': dict(decay_mult=0.), 'norm': dict(decay_mult=0.) })) lr_config = dict(policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=0.001, step=[8, 11]) optimizer_config = dict(grad_clip=None) runner = dict(type='EpochBasedRunner', max_epochs=12) ================================================ FILE: configs/nuimages/mask_rcnn_x101_32x4d_fpn_1x_nuim.py ================================================ _base_ = './mask_rcnn_r50_fpn_1x_nuim.py' model = dict( pretrained='open-mmlab://resnext101_32x4d', backbone=dict( type='ResNeXt', depth=101, groups=32, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), style='pytorch')) ================================================ FILE: configs/nuscenes.md ================================================ # MODEL ZOO ## Common settings and notes - The experiments are run with PyTorch 1.7.0, CUDA 10.1 and CUDNN 7.6 - The training is conducted on 8 Tesla V100 GPUs - For the *fade strategy* proposed by PointAugmenting(disenable the copy-and-paste augmentation for the last 5 epochs), we currently implement this strategy by manually stop training at 
epoch 15 and resume the training without copy-and-paste augmentation. If you find a more elegant way to implement this strategy, please let us know; we would really appreciate it. The fade strategy greatly reduces false positives, improving the mAP remarkably, especially for TransFusion-L, while having less influence on TransFusion. ## Pretrained 2D Backbones - DLA34: Following PointAugmenting, we directly reuse the checkpoints pretrained on the monocular 3D detection task provided by [CenterNet](https://github.com/xingyizhou/CenterTrack/blob/master/readme/MODEL_ZOO.md#monocular-3d-detection-tracking). - ResNet50 on instance segmentation: We acquire the model pretrained on nuImages from [MMDetection3D](https://github.com/open-mmlab/mmdetection3d/blob/v0.12.0/configs/nuimages/README.md). - ResNet50 on 2D detection: We train a model using the [config](https://github.com/open-mmlab/mmdetection3d/blob/v0.12.0/configs/nuimages/mask_rcnn_r50_fpn_1x_nuim.py) of instance segmentation but remove the mask head. ## nuScenes 3D Detection All the LiDAR-only models are trained for 20 epochs; the fusion-based models are further trained for 6 epochs from the pretrained LiDAR backbone. We freeze the weights of the LiDAR backbone to save GPU memory. | Model | Backbone | mAP | NDS | |---------|--------|--------|---------| | [TransFusion-L](configs/transfusion_nusc_pillar_L.py) | PointPillars | 54.51 | 62.66 | | [TransFusion](configs/transfusion_nusc_pillar_LC.py) | PointPillars | 60.21 | 65.50 | | [TransFusion-L](configs/transfusion_nusc_voxel_L.py) | VoxelNet | 65.06 | 70.10 | | [TransFusion](configs/transfusion_nusc_voxel_LC.py) | VoxelNet | 67.49 | 71.28 | ## nuScenes 3D Tracking We perform tracking-by-detection with the same tracking algorithms proposed by CenterPoint.
| Model | Backbone | AMOTA | AMOTP | |---------|--------|--------|---------| | [TransFusion-L](configs/transfusion_nusc_voxel_L.py) | VoxelNet | 0.703 | 0.553 | | [TransFusion](configs/transfusion_nusc_voxel_LC.py) | VoxelNet | 0.725 | 0.561 | ## nuScenes Leaderboard ### Detection We use 300 object queries during inference for online submission for slightly better performance. We do not use any test-time augmentation or model ensembling. | Model | Backbone | Test mAP | Test NDS | Link | |---------|--------|--------|---------|---------| | TransFusion-L | VoxelNet | 65.52 | 70.23 | [Detection](https://drive.google.com/file/d/1Wk8p2LJEhwfKfhsKzlU9vDBOd0zn38dN/view?usp=sharing) | | TransFusion | VoxelNet | 68.90 | 71.68 | [Detection](https://drive.google.com/file/d/1X7_ig4v5A2vKsiHtUGtgeMN-0RJKsM6W/view?usp=sharing) | ### Tracking | Model | Backbone | Test AMOTA | Test AMOTP | Link | |---------|--------|--------|---------|---------| | TransFusion-L| VoxelNet | 0.686 | 0.529 | [Detection](https://drive.google.com/file/d/1Wk8p2LJEhwfKfhsKzlU9vDBOd0zn38dN/view?usp=sharing) / [Tracking](https://drive.google.com/file/d/1pKvRBUsM9h1Xgturd0Ae_bnGt0m_j3hk/view?usp=sharing)| | TransFusion| VoxelNet | 0.718 | 0.551 | [Detection](https://drive.google.com/file/d/1X7_ig4v5A2vKsiHtUGtgeMN-0RJKsM6W/view?usp=sharing) / [Tracking](https://drive.google.com/file/d/1EVuS-MAg_HSXUVqMrXEs4-RpZp0p5cfv/view?usp=sharing)| ================================================ FILE: configs/parta2/README.md ================================================ # From Points to Parts: 3D Object Detection from Point Cloud with Part-aware and Part-aggregation Network ## Introduction [ALGORITHM] We implement Part-A^2 and provide its results and checkpoints on the KITTI dataset.
``` @article{shi2020points, title={From points to parts: 3d object detection from point cloud with part-aware and part-aggregation network}, author={Shi, Shaoshuai and Wang, Zhe and Shi, Jianping and Wang, Xiaogang and Li, Hongsheng}, journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, year={2020}, publisher={IEEE} } ``` ## Results ### KITTI | Backbone |Class| Lr schd | Mem (GB) | Inf time (fps) | mAP | Download | | :---------: | :-----: |:-----: | :------: | :------------: | :----: |:----: | | [SECFPN](./hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class.py) |3 Class|cyclic 80e|4.1||67.9|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class_20200620_230724-a2672098.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class_20200620_230724.log.json)| | [SECFPN](./hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car.py) |Car |cyclic 80e|4.0||79.16|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car_20200620_230755-f2a38b9a.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car_20200620_230755.log.json)| ================================================ FILE: configs/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class.py ================================================ _base_ = ['../_base_/schedules/cyclic_40e.py', '../_base_/default_runtime.py'] # model settings voxel_size = [0.05, 0.05, 0.1] point_cloud_range = [0, -40, -3, 70.4, 40, 1] model = dict( type='PartA2', voxel_layer=dict( max_num_points=5, point_cloud_range=point_cloud_range, voxel_size=voxel_size, max_voxels=(16000, 40000)), 
voxel_encoder=dict(type='HardSimpleVFE'), middle_encoder=dict( type='SparseUNet', in_channels=4, sparse_shape=[41, 1600, 1408], order=('conv', 'norm', 'act')), backbone=dict( type='SECOND', in_channels=256, layer_nums=[5, 5], layer_strides=[1, 2], out_channels=[128, 256]), neck=dict( type='SECONDFPN', in_channels=[128, 256], upsample_strides=[1, 2], out_channels=[256, 256]), rpn_head=dict( type='PartA2RPNHead', num_classes=3, in_channels=512, feat_channels=512, use_direction_classifier=True, anchor_generator=dict( type='Anchor3DRangeGenerator', ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -1.78, 70.4, 40.0, -1.78]], sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, assigner_per_size=True, assign_per_class=True, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), roi_head=dict( type='PartAggregationROIHead', num_classes=3, semantic_head=dict( type='PointwiseSemanticHead', in_channels=16, extra_width=0.2, seg_score_thr=0.3, num_classes=3, loss_seg=dict( type='FocalLoss', use_sigmoid=True, reduction='sum', gamma=2.0, alpha=0.25, loss_weight=1.0), loss_part=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), seg_roi_extractor=dict( type='Single3DRoIAwareExtractor', roi_layer=dict( type='RoIAwarePool3d', out_size=14, max_pts_per_voxel=128, mode='max')), part_roi_extractor=dict( type='Single3DRoIAwareExtractor', roi_layer=dict( type='RoIAwarePool3d', out_size=14, max_pts_per_voxel=128, mode='avg')), bbox_head=dict( type='PartA2BboxHead', num_classes=3, seg_in_channels=16, part_in_channels=4, seg_conv_channels=[64, 64], part_conv_channels=[64, 64], merge_conv_channels=[128, 128], down_conv_channels=[128, 
256], bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), shared_fc_channels=[256, 512, 512, 512], cls_channels=[256, 256], reg_channels=[256, 256], dropout_ratio=0.1, roi_feat_size=14, with_corner_loss=True, loss_bbox=dict( type='SmoothL1Loss', beta=1.0 / 9.0, reduction='sum', loss_weight=1.0), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, reduction='sum', loss_weight=1.0))), # model training and testing settings train_cfg=dict( rpn=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1) ], allowed_border=0, pos_weight=-1, debug=False), rpn_proposal=dict( nms_pre=9000, nms_post=512, max_num=512, nms_thr=0.8, score_thr=0, use_rotate_nms=False), rcnn=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict( type='BboxOverlaps3D', coordinate='lidar'), pos_iou_thr=0.55, neg_iou_thr=0.55, min_pos_iou=0.55, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict( type='BboxOverlaps3D', coordinate='lidar'), pos_iou_thr=0.55, neg_iou_thr=0.55, min_pos_iou=0.55, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict( type='BboxOverlaps3D', coordinate='lidar'), pos_iou_thr=0.55, neg_iou_thr=0.55, min_pos_iou=0.55, ignore_iof_thr=-1) ], sampler=dict( type='IoUNegPiecewiseSampler', num=128, pos_fraction=0.55, neg_piece_fractions=[0.8, 0.2], neg_iou_piece_thrs=[0.55, 0.1], neg_pos_ub=-1, add_gt_as_proposals=False, return_iou=True), cls_pos_thr=0.75, cls_neg_thr=0.25)), test_cfg=dict( rpn=dict( nms_pre=1024, nms_post=100, max_num=100, 
nms_thr=0.7, score_thr=0, use_rotate_nms=True), rcnn=dict( use_rotate_nms=True, use_raw_score=True, nms_thr=0.01, score_thr=0.1))) # dataset settings dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Pedestrian', 'Cyclist', 'Car'] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'kitti_dbinfos_train.pkl', rate=1.0, prepare=dict( filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), classes=class_names, sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6)) train_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='ObjectNoise', num_try=100, translation_std=[1.0, 1.0, 0.5], global_rot_range=[0.0, 0.0], rot_range=[-0.78539816, 0.78539816]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( 
samples_per_gpu=2, workers_per_gpu=2, train=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_train.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False)), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True)) # Part-A2 uses a different learning rate from what SECOND uses. lr = 0.001 optimizer = dict(lr=lr) find_unused_parameters = True ================================================ FILE: configs/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car.py ================================================ _base_ = './hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class.py' voxel_size = [0.05, 0.05, 0.1] point_cloud_range = [0, -40, -3, 70.4, 40, 1] # velodyne coordinates, x, y, z model = dict( rpn_head=dict( type='PartA2RPNHead', num_classes=1, anchor_generator=dict( _delete_=True, type='Anchor3DRangeGenerator', ranges=[[0, -40.0, -1.78, 70.4, 40.0, -1.78]], sizes=[[1.6, 3.9, 1.56]], rotations=[0, 1.57], reshape_out=False)), roi_head=dict( num_classes=1, semantic_head=dict(num_classes=1), bbox_head=dict(num_classes=1)), # model training and testing settings train_cfg=dict( _delete_=True, rpn=dict( assigner=dict( type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1), allowed_border=0, pos_weight=-1, debug=False), rpn_proposal=dict( nms_pre=9000, nms_post=512, max_num=512, nms_thr=0.8, score_thr=0, use_rotate_nms=False), 
rcnn=dict( assigner=dict( # for Car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'), pos_iou_thr=0.55, neg_iou_thr=0.55, min_pos_iou=0.55, ignore_iof_thr=-1), sampler=dict( type='IoUNegPiecewiseSampler', num=128, pos_fraction=0.55, neg_piece_fractions=[0.8, 0.2], neg_iou_piece_thrs=[0.55, 0.1], neg_pos_ub=-1, add_gt_as_proposals=False, return_iou=True), cls_pos_thr=0.75, cls_neg_thr=0.25)), test_cfg=dict( rpn=dict( nms_pre=1024, nms_post=100, max_num=100, nms_thr=0.7, score_thr=0, use_rotate_nms=True), rcnn=dict( use_rotate_nms=True, use_raw_score=True, nms_thr=0.01, score_thr=0.1))) # dataset settings dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Car'] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'kitti_dbinfos_train.pkl', rate=1.0, prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), classes=class_names, sample_groups=dict(Car=15)) train_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='ObjectNoise', num_try=100, translation_std=[1.0, 1.0, 0.5], global_rot_range=[0.0, 0.0], rot_range=[-0.78539816, 0.78539816]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4), dict( 
type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( train=dict(dataset=dict(pipeline=train_pipeline, classes=class_names)), val=dict(pipeline=test_pipeline, classes=class_names), test=dict(pipeline=test_pipeline, classes=class_names)) find_unused_parameters = True ================================================ FILE: configs/pointpillars/README.md ================================================ # PointPillars: Fast Encoders for Object Detection from Point Clouds ## Introduction [ALGORITHM] We implement PointPillars and provide the results and checkpoints on KITTI, nuScenes, Lyft and Waymo datasets. ``` @inproceedings{lang2019pointpillars, title={Pointpillars: Fast encoders for object detection from point clouds}, author={Lang, Alex H and Vora, Sourabh and Caesar, Holger and Zhou, Lubing and Yang, Jiong and Beijbom, Oscar}, booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, pages={12697--12705}, year={2019} } ``` ## Results ### KITTI | Backbone|Class | Lr schd | Mem (GB) | Inf time (fps) | AP |Download | | :---------: | :-----: |:-----: | :------: | :------------: | :----: | :------: | | [SECFPN](./hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py)|Car|cyclic 160e|5.4||77.1|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20200620_230614-77663cd6.pth) | 
[log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20200620_230614.log.json)| | [SECFPN](./hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py)|3 Class|cyclic 160e|5.5||59.5|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class_20200620_230421-aa0f3adb.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class_20200620_230421.log.json)| ### nuScenes | Backbone | Lr schd | Mem (GB) | Inf time (fps) | mAP |NDS| Download | | :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | |[SECFPN](./hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py)|2x|16.4||35.17|49.7|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230725-0817d270.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230725.log.json)| |[FPN](./hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d.py)|2x|16.4||40.0|53.3|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405-2fa62f3d.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405.log.json)| ### Lyft | Backbone | Lr schd | Mem (GB) | Inf time (fps) | Private Score | Public Score | Download | | :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | 
|[SECFPN](./hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d.py)|2x|||13.4|13.4|| |[FPN](./hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d.py)|2x|||14.0|14.2|| ### Waymo | Backbone | Load Interval | Class | Lr schd | Mem (GB) | Inf time (fps) | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** | Download | | :-------: | :-----------: |:-----:| :------:| :------: | :------------: | :----: | :-----: | :-----: | :-----: | :------: | | [SECFPN](./hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car.py)|5|Car|2x|7.76||70.2|69.6|62.6|62.1|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car_20200901_204315-302fc3e7.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car_20200901_204315.log.json)| | [SECFPN](./hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py)|5|3 Class|2x|8.12||64.7|57.6|58.4|52.1|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class_20200831_204144-d1a706b1.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class_20200831_204144.log.json)| | above @ Car|||2x|8.12||68.5|67.9|60.1|59.6| | | above @ Pedestrian|||2x|8.12||67.8|50.6|59.6|44.3| | | above @ Cyclist|||2x|8.12||57.7|54.4|55.5|52.4| | | [SECFPN](./hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-car.py)|1|Car|2x|7.76||72.1|71.5|63.6|63.1|[log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-car/hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-car.log.json)| | [SECFPN](./hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-3class.py)|1|3
Class|2x|8.12||68.8|63.3|62.6|57.6|[log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-3class/hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-3class.log.json)| | above @ Car|||2x|8.12||71.6|71.0|63.1|62.5| | | above @ Pedestrian|||2x|8.12||70.6|56.7|62.9|50.2| | | above @ Cyclist|||2x|8.12||64.4|62.3|61.9|59.9| | #### Note: - **Metric**: For models trained with 3 classes, the average APH@L2 (mAPH@L2) of all the categories is reported and used to rank the model. For models trained with only 1 class, the APH@L2 is reported and used to rank the model. - **Data Split**: Here we provide several baselines for the Waymo dataset, among which D5 means that we divide the dataset into 5 folds and only use one fold for efficient experiments. Using the complete dataset can boost the performance a lot, especially for the detection of cyclist and pedestrian, where more than 5 mAP or mAPH improvement can be expected. - **Implementation Details**: We basically follow the implementation in the [paper](https://arxiv.org/pdf/1912.04838.pdf) in terms of the network architecture (having a stride of 1 for the first convolutional block). Different settings of voxelization, data augmentation and hyper parameters make these baselines outperform those in the paper by about 7 mAP for car and 4 mAP for pedestrian with only a subset of the whole dataset. All of these results are achieved without bells-and-whistles, e.g. ensemble, multi-scale training and test augmentation. - **License Agreement**: To comply with the [license agreement of Waymo dataset](https://waymo.com/open/terms/), the pre-trained models on the Waymo dataset are not released. We still release the training log as a reference to ease future research.
================================================ FILE: configs/pointpillars/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d.py ================================================ _base_ = [ '../_base_/models/hv_pointpillars_fpn_lyft.py', '../_base_/datasets/lyft-3d.py', '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' ] ================================================ FILE: configs/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d.py ================================================ _base_ = [ '../_base_/models/hv_pointpillars_fpn_nus.py', '../_base_/datasets/nus-3d.py', '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' ] ================================================ FILE: configs/pointpillars/hv_pointpillars_fpn_sbn-all_range100_2x8_2x_lyft-3d.py ================================================ _base_ = [ '../_base_/models/hv_pointpillars_fpn_range100_lyft.py', '../_base_/datasets/range100_lyft-3d.py', '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' ] ================================================ FILE: configs/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py ================================================ _base_ = [ '../_base_/models/hv_pointpillars_secfpn_kitti.py', '../_base_/datasets/kitti-3d-3class.py', '../_base_/schedules/cyclic_40e.py', '../_base_/default_runtime.py' ] point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1] # dataset settings data_root = 'data/kitti/' class_names = ['Pedestrian', 'Cyclist', 'Car'] # PointPillars adopted a different sampling strategies among classes db_sampler = dict( data_root=data_root, info_path=data_root + 'kitti_dbinfos_train.pkl', rate=1.0, prepare=dict( filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), classes=class_names, sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10)) # PointPillars uses different augmentation hyper parameters train_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', 
load_dim=4, use_dim=4), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='ObjectNoise', num_try=100, translation_std=[0.25, 0.25, 0.25], global_rot_range=[0.0, 0.0], rot_range=[-0.15707963267, 0.15707963267]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( train=dict(dataset=dict(pipeline=train_pipeline, classes=class_names)), val=dict(pipeline=test_pipeline, classes=class_names), test=dict(pipeline=test_pipeline, classes=class_names)) # In practice PointPillars also uses a different schedule # optimizer lr = 0.001 optimizer = dict(lr=lr) # max_norm=35 is slightly better than 10 for PointPillars in the earlier # development of the codebase thus we keep the setting. But we do not # specifically tune this parameter.
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # Use evaluation interval=2 to reduce the number of evaluation times evaluation = dict(interval=2) # PointPillars usually needs a longer schedule than SECOND, so we simply double # the training schedule. Note that since we use RepeatDataset with a # repeat factor of 2, we actually train for 160 epochs. total_epochs = 80 ================================================ FILE: configs/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py ================================================ # model settings _base_ = './hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py' point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1] model = dict( bbox_head=dict( type='Anchor3DHead', num_classes=1, anchor_generator=dict( _delete_=True, type='Anchor3DRangeGenerator', ranges=[[0, -39.68, -1.78, 69.12, 39.68, -1.78]], sizes=[[1.6, 3.9, 1.56]], rotations=[0, 1.57], reshape_out=True)), # model training and testing settings train_cfg=dict( _delete_=True, assigner=dict( type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1), allowed_border=0, pos_weight=-1, debug=False)) # dataset settings dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Car'] db_sampler = dict( data_root=data_root, info_path=data_root + 'kitti_dbinfos_train.pkl', rate=1.0, prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), sample_groups=dict(Car=15), classes=class_names) train_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='ObjectNoise', num_try=100, translation_std=[0.25, 0.25, 0.25], global_rot_range=[0.0, 0.0], rot_range=[-0.15707963267, 0.15707963267]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict( type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( train=dict( type='RepeatDataset', times=2, dataset=dict(pipeline=train_pipeline, classes=class_names)), val=dict(pipeline=test_pipeline, classes=class_names), test=dict(pipeline=test_pipeline, classes=class_names)) ================================================ FILE: configs/pointpillars/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d.py ================================================ _base_ = [ '../_base_/models/hv_pointpillars_fpn_lyft.py', '../_base_/datasets/lyft-3d.py', '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py', ] # model settings model = dict( pts_neck=dict( _delete_=True, type='SECONDFPN', norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), in_channels=[64, 128, 256], upsample_strides=[1, 2, 4], out_channels=[128, 128, 128]), pts_bbox_head=dict( in_channels=384, feat_channels=384, anchor_generator=dict( _delete_=True, type='AlignedAnchor3DRangeGenerator', ranges=[[-80, -80, -1.0715024, 80, 80, -1.0715024], [-80, -80, -0.3033737, 80, 80, -0.3033737], [-80, -80, -0.3519405, 80, 80, -0.3519405], [-80, -80, -0.8871424, 80, 80, 
-0.8871424], [-80, -80, -0.6276341, 80, 80, -0.6276341], [-80, -80, -1.3220503, 80, 80, -1.3220503], [-80, -80, -1.0709302, 80, 80, -1.0709302], [-80, -80, -0.9122268, 80, 80, -0.9122268], [-80, -80, -1.8012227, 80, 80, -1.8012227]], sizes=[ [1.92, 4.75, 1.71], # car [2.84, 10.24, 3.44], # truck [2.92, 12.70, 3.42], # bus [2.42, 6.52, 2.34], # emergency vehicle [2.75, 8.17, 3.20], # other vehicle [0.96, 2.35, 1.59], # motorcycle [0.63, 1.76, 1.44], # bicycle [0.76, 0.80, 1.76], # pedestrian [0.35, 0.73, 0.50] # animal ], rotations=[0, 1.57], reshape_out=True))) ================================================ FILE: configs/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py ================================================ _base_ = [ '../_base_/models/hv_pointpillars_fpn_nus.py', '../_base_/datasets/nus-3d.py', '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py', ] # model settings model = dict( pts_neck=dict( _delete_=True, type='SECONDFPN', norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), in_channels=[64, 128, 256], upsample_strides=[1, 2, 4], out_channels=[128, 128, 128]), pts_bbox_head=dict( in_channels=384, feat_channels=384, anchor_generator=dict( _delete_=True, type='AlignedAnchor3DRangeGenerator', ranges=[ [-49.6, -49.6, -1.80032795, 49.6, 49.6, -1.80032795], [-49.6, -49.6, -1.74440365, 49.6, 49.6, -1.74440365], [-49.6, -49.6, -1.68526504, 49.6, 49.6, -1.68526504], [-49.6, -49.6, -1.67339111, 49.6, 49.6, -1.67339111], [-49.6, -49.6, -1.61785072, 49.6, 49.6, -1.61785072], [-49.6, -49.6, -1.80984986, 49.6, 49.6, -1.80984986], [-49.6, -49.6, -1.763965, 49.6, 49.6, -1.763965], ], sizes=[ [1.95017717, 4.60718145, 1.72270761], # car [2.4560939, 6.73778078, 2.73004906], # truck [2.87427237, 12.01320693, 3.81509561], # trailer [0.60058911, 1.68452161, 1.27192197], # bicycle [0.66344886, 0.7256437, 1.75748069], # pedestrian [0.39694519, 0.40359262, 1.06232151], # traffic_cone [2.49008838, 0.48578221, 0.98297065], # barrier ], 
custom_values=[0, 0], rotations=[0, 1.57], reshape_out=True))) ================================================ FILE: configs/pointpillars/hv_pointpillars_secfpn_sbn-all_range100_2x8_2x_lyft-3d.py ================================================ _base_ = [ '../_base_/models/hv_pointpillars_fpn_range100_lyft.py', '../_base_/datasets/range100_lyft-3d.py', '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' ] # model settings model = dict( pts_neck=dict( _delete_=True, type='SECONDFPN', norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), in_channels=[64, 128, 256], upsample_strides=[1, 2, 4], out_channels=[128, 128, 128]), pts_bbox_head=dict( in_channels=384, feat_channels=384, anchor_generator=dict( _delete_=True, type='AlignedAnchor3DRangeGenerator', ranges=[[-100, -100, -1.0715024, 100, 100, -1.0715024], [-100, -100, -0.3033737, 100, 100, -0.3033737], [-100, -100, -0.3519405, 100, 100, -0.3519405], [-100, -100, -0.8871424, 100, 100, -0.8871424], [-100, -100, -0.6276341, 100, 100, -0.6276341], [-100, -100, -1.3220503, 100, 100, -1.3220503], [-100, -100, -1.0709302, 100, 100, -1.0709302], [-100, -100, -0.9122268, 100, 100, -0.9122268], [-100, -100, -1.8012227, 100, 100, -1.8012227]], sizes=[ [1.92, 4.75, 1.71], # car [2.84, 10.24, 3.44], # truck [2.92, 12.70, 3.42], # bus [2.42, 6.52, 2.34], # emergency vehicle [2.75, 8.17, 3.20], # other vehicle [0.96, 2.35, 1.59], # motorcycle [0.63, 1.76, 1.44], # bicycle [0.76, 0.80, 1.76], # pedestrian [0.35, 0.73, 0.50] # animal ], rotations=[0, 1.57], reshape_out=True))) ================================================ FILE: configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-3class.py ================================================ _base_ = [ '../_base_/models/hv_pointpillars_secfpn_waymo.py', '../_base_/datasets/waymoD5-3d-3class.py', '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py', ] # data settings data = dict(train=dict(dataset=dict(load_interval=1))) 
================================================ FILE: configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-car.py ================================================ _base_ = [ '../_base_/models/hv_pointpillars_secfpn_waymo.py', '../_base_/datasets/waymoD5-3d-car.py', '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py', ] # data settings data = dict(train=dict(dataset=dict(load_interval=1))) # model settings model = dict( type='MVXFasterRCNN', pts_bbox_head=dict( type='Anchor3DHead', num_classes=1, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345]], sizes=[[2.08, 4.73, 1.77]], rotations=[0, 1.57], reshape_out=True)), # model training and testing settings train_cfg=dict( _delete_=True, pts=dict( assigner=dict( type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.55, neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), allowed_border=0, code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], pos_weight=-1, debug=False))) ================================================ FILE: configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py ================================================ _base_ = [ '../_base_/models/hv_pointpillars_secfpn_waymo.py', '../_base_/datasets/waymoD5-3d-3class.py', '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py', ] ================================================ FILE: configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car.py ================================================ _base_ = [ '../_base_/models/hv_pointpillars_secfpn_waymo.py', '../_base_/datasets/waymoD5-3d-car.py', '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py', ] # model settings model = dict( type='MVXFasterRCNN', pts_bbox_head=dict( type='Anchor3DHead', num_classes=1, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345]], 
sizes=[[2.08, 4.73, 1.77]], rotations=[0, 1.57], reshape_out=True)), # model training and testing settings train_cfg=dict( _delete_=True, pts=dict( assigner=dict( type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.55, neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), allowed_border=0, code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], pos_weight=-1, debug=False))) ================================================ FILE: configs/regnet/README.md ================================================ # Designing Network Design Spaces ## Introduction [BACKBONE] We implement RegNetX models in 3D detection systems and provide their first results with PointPillars on nuScenes dataset. The pre-trained models are converted from [model zoo of pycls](https://github.com/facebookresearch/pycls/blob/master/MODEL_ZOO.md) and maintained in [mmcv](https://github.com/open-mmlab/mmcv). ``` @article{radosavovic2020designing, title={Designing Network Design Spaces}, author={Ilija Radosavovic and Raj Prateek Kosaraju and Ross Girshick and Kaiming He and Piotr Dollár}, year={2020}, eprint={2003.13678}, archivePrefix={arXiv}, primaryClass={cs.CV} } ``` ## Usage To use a regnet model, there are two steps to do: 1. Convert the model to ResNet-style supported by MMDetection 2. Modify backbone and neck in config accordingly ### Convert model We have already prepared models with FLOPs from 800M to 12G in our model zoo. For more general usage, we also provide the script `regnet2mmdet.py` in the tools directory to convert the keys of models pretrained by [pycls](https://github.com/facebookresearch/pycls/) to ResNet-style checkpoints used in MMDetection. ```bash python -u tools/model_converters/regnet2mmdet.py ${PRETRAIN_PATH} ${STORE_PATH} ``` This script converts the model from `PRETRAIN_PATH` and stores the converted model in `STORE_PATH`.
### Modify config The users can modify the config's `depth` of backbone and corresponding keys in `arch` according to the configs in the [pycls model zoo](https://github.com/facebookresearch/pycls/blob/master/MODEL_ZOO.md). The parameter `in_channels` in FPN can be found in Figures 15 & 16 of the paper (`wi` in the legend). This directory already provides some configs with their performance, using RegNetX from the 800MF to the 12GF level. For other pre-trained models or self-implemented regnet models, the users are responsible for checking these parameters themselves. **Note**: Although Fig. 15 & 16 also provide `w0`, `wa`, `wm`, `group_w`, and `bot_mul` for `arch`, they are quantized and thus inaccurate; using them sometimes produces a different backbone whose keys do not match those in the pre-trained model. ## Results ### nuScenes | Backbone | Lr schd | Mem (GB) | Inf time (fps) | mAP |NDS| Download | | :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | |[SECFPN](../pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py)|2x|16.4||35.17|49.7|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230725-0817d270.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230725.log.json)| |[RegNetX-400MF-SECFPN](./hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d.py)| 2x |16.4||41.2|55.2|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230334-53044f32.pth) &#124; 
[log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230334.log.json)| |[FPN](../pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d.py)|2x|17.1||40.0|53.3|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405-2fa62f3d.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405.log.json)| |[RegNetX-400MF-FPN](./hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d.py)|2x|17.3||44.8|56.4|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d_20200620_230239-c694dce7.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d_20200620_230239.log.json)| |[RegNetX-1.6GF-FPN](./hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d.py)|2x|24.0||48.2|59.3|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d_20200629_050311-dcd4e090.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d_20200629_050311.log.json)| ### Lyft | Backbone | Lr schd | Mem (GB) | Inf time (fps) | Private Score | Public Score | Download | | :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | 
|[SECFPN](../pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_lyft-3d.py)|2x|||13.4|13.4|| |[RegNetX-400MF-SECFPN](./hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d.py)| 2x |||||| |[FPN](../pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_lyft-3d.py)|2x|||14.0|14.2|| |[RegNetX-400MF-FPN](./hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d.py)|2x|||15.5|15.6|| ================================================ FILE: configs/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d.py ================================================ _base_ = [ '../_base_/models/hv_pointpillars_fpn_nus.py', '../_base_/datasets/nus-3d.py', '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py', ] # model settings model = dict( type='MVXFasterRCNN', pretrained=dict(pts='open-mmlab://regnetx_1.6gf'), pts_backbone=dict( _delete_=True, type='NoStemRegNet', arch='regnetx_1.6gf', out_indices=(1, 2, 3), frozen_stages=-1, strides=(1, 2, 2, 2), base_channels=64, stem_channels=64, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), norm_eval=False, style='pytorch'), pts_neck=dict(in_channels=[168, 408, 912])) ================================================ FILE: configs/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d.py ================================================ _base_ = [ '../_base_/models/hv_pointpillars_fpn_lyft.py', '../_base_/datasets/lyft-3d.py', '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py', ] # model settings model = dict( type='MVXFasterRCNN', pretrained=dict(pts='open-mmlab://regnetx_400mf'), pts_backbone=dict( _delete_=True, type='NoStemRegNet', arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0), out_indices=(1, 2, 3), frozen_stages=-1, strides=(1, 2, 2, 2), base_channels=64, stem_channels=64, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), norm_eval=False, style='pytorch'), pts_neck=dict(in_channels=[64, 160, 384])) 
================================================
FILE: configs/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d.py
================================================
# PointPillars (FPN base model) on nuScenes with the point-cloud backbone
# swapped for a RegNetX-400MF `NoStemRegNet`.
_base_ = [
    '../_base_/models/hv_pointpillars_fpn_nus.py',
    '../_base_/datasets/nus-3d.py',
    '../_base_/schedules/schedule_2x.py',
    '../_base_/default_runtime.py',
]
# model settings
model = dict(
    type='MVXFasterRCNN',
    pretrained=dict(pts='open-mmlab://regnetx_400mf'),
    pts_backbone=dict(
        # replace (not merge with) the backbone inherited from the base model
        _delete_=True,
        type='NoStemRegNet',
        arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        strides=(1, 2, 2, 2),
        base_channels=64,
        stem_channels=64,
        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
        norm_eval=False,
        style='pytorch'),
    # neck input widths for the three selected backbone stages
    # (see configs/regnet/README.md, "Modify config")
    pts_neck=dict(in_channels=[64, 160, 384]))

================================================
FILE: configs/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_range100_2x8_2x_lyft-3d.py
================================================
# Same RegNetX-400MF backbone override, on the range-100 Lyft base
# model/dataset configs.
_base_ = [
    '../_base_/models/hv_pointpillars_fpn_range100_lyft.py',
    '../_base_/datasets/range100_lyft-3d.py',
    '../_base_/schedules/schedule_2x.py',
    '../_base_/default_runtime.py',
]
# model settings
model = dict(
    type='MVXFasterRCNN',
    pretrained=dict(pts='open-mmlab://regnetx_400mf'),
    pts_backbone=dict(
        # replace (not merge with) the backbone inherited from the base model
        _delete_=True,
        type='NoStemRegNet',
        arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        strides=(1, 2, 2, 2),
        base_channels=64,
        stem_channels=64,
        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
        norm_eval=False,
        style='pytorch'),
    pts_neck=dict(in_channels=[64, 160, 384]))

================================================
FILE: configs/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d.py
================================================
# Derives from the RegNetX-400MF FPN Lyft config and swaps the neck for
# SECONDFPN; the head and anchors are redefined to go with the new neck.
_base_ = './hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d.py'
# model settings
model = dict(
    pts_neck=dict(
        type='SECONDFPN',
        _delete_=True,
        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
        in_channels=[64, 160, 384],
        upsample_strides=[1, 2, 4],
        out_channels=[128, 128, 128]),
    pts_bbox_head=dict(
        type='Anchor3DHead',
        # 384 = 3 * 128, the sum of the neck's out_channels above
        in_channels=384,
        feat_channels=384,
        anchor_generator=dict(
            _delete_=True,
            type='AlignedAnchor3DRangeGenerator',
            # one range per entry of `sizes` below (9 each)
            ranges=[[-80, -80, -1.0715024, 80, 80, -1.0715024],
                    [-80, -80, -0.3033737, 80, 80, -0.3033737],
                    [-80, -80, -0.3519405, 80, 80, -0.3519405],
                    [-80, -80, -0.8871424, 80, 80, -0.8871424],
                    [-80, -80, -0.6276341, 80, 80, -0.6276341],
                    [-80, -80, -1.3220503, 80, 80, -1.3220503],
                    [-80, -80, -1.0709302, 80, 80, -1.0709302],
                    [-80, -80, -0.9122268, 80, 80, -0.9122268],
                    [-80, -80, -1.8012227, 80, 80, -1.8012227]],
            sizes=[
                [1.92, 4.75, 1.71],  # car
                [2.84, 10.24, 3.44],  # truck
                [2.92, 12.70, 3.42],  # bus
                [2.42, 6.52, 2.34],  # emergency vehicle
                [2.75, 8.17, 3.20],  # other vehicle
                [0.96, 2.35, 1.59],  # motorcycle
                [0.63, 1.76, 1.44],  # bicycle
                [0.76, 0.80, 1.76],  # pedestrian
                [0.35, 0.73, 0.50]  # animal
            ],
            rotations=[0, 1.57],
            reshape_out=True)))

================================================
FILE: configs/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d.py
================================================
# Derives from the RegNetX-400MF FPN nuScenes config and swaps the neck for
# SECONDFPN; the head and anchors are redefined to go with the new neck.
_base_ = './hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d.py'
# model settings
model = dict(
    pts_neck=dict(
        type='SECONDFPN',
        _delete_=True,
        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
        in_channels=[64, 160, 384],
        upsample_strides=[1, 2, 4],
        out_channels=[128, 128, 128]),
    pts_bbox_head=dict(
        type='Anchor3DHead',
        # 384 = 3 * 128, the sum of the neck's out_channels above
        in_channels=384,
        feat_channels=384,
        anchor_generator=dict(
            _delete_=True,
            type='AlignedAnchor3DRangeGenerator',
            # one range per entry of `sizes` below (7 each)
            ranges=[
                [-49.6, -49.6, -1.80032795, 49.6, 49.6, -1.80032795],
                [-49.6, -49.6, -1.74440365, 49.6, 49.6, -1.74440365],
                [-49.6, -49.6, -1.68526504, 49.6, 49.6, -1.68526504],
                [-49.6, -49.6, -1.67339111, 49.6, 49.6, -1.67339111],
                [-49.6, -49.6, -1.61785072, 49.6, 49.6, -1.61785072],
                [-49.6, -49.6, -1.80984986, 49.6, 49.6, -1.80984986],
                [-49.6, -49.6, -1.763965, 49.6, 49.6, -1.763965],
            ],
            sizes=[
                [1.95017717, 4.60718145, 1.72270761],  # car
                [2.4560939, 6.73778078, 2.73004906],  # truck
                [2.87427237, 12.01320693, 3.81509561],  # trailer
                [0.60058911, 1.68452161, 1.27192197],  # bicycle
                [0.66344886, 0.7256437, 1.75748069],  # pedestrian
                [0.39694519, 0.40359262, 1.06232151],  # traffic_cone
                [2.49008838, 0.48578221, 0.98297065],  # barrier
            ],
            custom_values=[0, 0],
            rotations=[0, 1.57],
            reshape_out=True)))

================================================
FILE: configs/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_range100_2x8_2x_lyft-3d.py
================================================
# Range-100 Lyft counterpart of the SECONDFPN variant: same anchor sizes as
# the 80 m Lyft config, with the x/y anchor ranges widened to +/-100.
_base_ = \
    './hv_pointpillars_regnet-400mf_fpn_sbn-all_range100_2x8_2x_lyft-3d.py'
# model settings
model = dict(
    pts_neck=dict(
        type='SECONDFPN',
        _delete_=True,
        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
        in_channels=[64, 160, 384],
        upsample_strides=[1, 2, 4],
        out_channels=[128, 128, 128]),
    pts_bbox_head=dict(
        type='Anchor3DHead',
        # 384 = 3 * 128, the sum of the neck's out_channels above
        in_channels=384,
        feat_channels=384,
        anchor_generator=dict(
            _delete_=True,
            type='AlignedAnchor3DRangeGenerator',
            # one range per entry of `sizes` below (9 each)
            ranges=[[-100, -100, -1.0715024, 100, 100, -1.0715024],
                    [-100, -100, -0.3033737, 100, 100, -0.3033737],
                    [-100, -100, -0.3519405, 100, 100, -0.3519405],
                    [-100, -100, -0.8871424, 100, 100, -0.8871424],
                    [-100, -100, -0.6276341, 100, 100, -0.6276341],
                    [-100, -100, -1.3220503, 100, 100, -1.3220503],
                    [-100, -100, -1.0709302, 100, 100, -1.0709302],
                    [-100, -100, -0.9122268, 100, 100, -0.9122268],
                    [-100, -100, -1.8012227, 100, 100, -1.8012227]],
            sizes=[
                [1.92, 4.75, 1.71],  # car
                [2.84, 10.24, 3.44],  # truck
                [2.92, 12.70, 3.42],  # bus
                [2.42, 6.52, 2.34],  # emergency vehicle
                [2.75, 8.17, 3.20],  # other vehicle
                [0.96, 2.35, 1.59],  # motorcycle
                [0.63, 1.76, 1.44],  # bicycle
                [0.76, 0.80, 1.76],  # pedestrian
                [0.35, 0.73, 0.50]  # animal
            ],
            rotations=[0, 1.57],
            reshape_out=True)))

================================================
FILE: configs/second/README.md
================================================ # Second: Sparsely embedded convolutional detection ## Introduction [ALGORITHM] We implement SECOND and provide the results and checkpoints on the KITTI dataset. ``` @article{yan2018second, title={Second: Sparsely embedded convolutional detection}, author={Yan, Yan and Mao, Yuxing and Li, Bo}, journal={Sensors}, year={2018}, publisher={Multidisciplinary Digital Publishing Institute} } ``` ## Results ### KITTI | Backbone |Class| Lr schd | Mem (GB) | Inf time (fps) | mAP |Download | | :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | | [SECFPN](./hv_second_secfpn_6x8_80e_kitti-3d-car.py)| Car |cyclic 80e|5.4||79.07|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_6x8_80e_kitti-3d-car/hv_second_secfpn_6x8_80e_kitti-3d-car_20200620_230238-393f000c.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_6x8_80e_kitti-3d-car/hv_second_secfpn_6x8_80e_kitti-3d-car_20200620_230238.log.json)| | [SECFPN](./hv_second_secfpn_6x8_80e_kitti-3d-3class.py)| 3 Class |cyclic 80e|5.4||64.41|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_6x8_80e_kitti-3d-3class/hv_second_secfpn_6x8_80e_kitti-3d-3class_20200620_230238-9208083a.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_6x8_80e_kitti-3d-3class/hv_second_secfpn_6x8_80e_kitti-3d-3class_20200620_230238.log.json)| ### Waymo | Backbone | Load Interval | Class | Lr schd | Mem (GB) | Inf time (fps) | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** | Download | | :-------: | :-----------: |:-----:| :------:| :------: | :------------: | :----: | :-----: | :-----: | :-----: | :------: | | [SECFPN](./hv_second_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py)|5|3 
Class|2x|8.12||65.3|61.7|58.9|55.7|[log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_sbn_4x8_2x_waymoD5-3d-3class/hv_second_secfpn_sbn_4x8_2x_waymoD5-3d-3class_20201115_112448.log.json)| | above @ Car|||2x|8.12||67.1|66.6|58.7|58.2| | | above @ Pedestrian|||2x|8.12||68.1|59.1|59.5|51.5| | | above @ Cyclist|||2x|8.12||60.7|59.5|58.4|57.3| | Note: See more details about metrics and data split on Waymo [HERE](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/pointpillars). For implementation details, we basically follow the original settings. All of these results are achieved without bells-and-whistles, e.g. ensemble, multi-scale training and test augmentation. ================================================ FILE: configs/second/hv_second_secfpn_6x8_80e_kitti-3d-3class.py ================================================ _base_ = [ '../_base_/models/hv_second_secfpn_kitti.py', '../_base_/datasets/kitti-3d-3class.py', '../_base_/schedules/cyclic_40e.py', '../_base_/default_runtime.py' ] ================================================ FILE: configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py ================================================ _base_ = [ '../_base_/models/hv_second_secfpn_kitti.py', '../_base_/datasets/kitti-3d-car.py', '../_base_/schedules/cyclic_40e.py', '../_base_/default_runtime.py' ] point_cloud_range = [0, -40, -3, 70.4, 40, 1] model = dict( bbox_head=dict( type='Anchor3DHead', num_classes=1, anchor_generator=dict( _delete_=True, type='Anchor3DRangeGenerator', ranges=[[0, -40.0, -1.78, 70.4, 40.0, -1.78]], sizes=[[1.6, 3.9, 1.56]], rotations=[0, 1.57], reshape_out=True)), # model training and testing settings train_cfg=dict( _delete_=True, assigner=dict( type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1), allowed_border=0, pos_weight=-1, debug=False)) ================================================ 
FILE: configs/second/hv_second_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py ================================================ _base_ = [ '../_base_/models/hv_second_secfpn_waymo.py', '../_base_/datasets/waymoD5-3d-3class.py', '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py', ] dataset_type = 'WaymoDataset' data_root = 'data/waymo/kitti_format/' class_names = ['Car', 'Pedestrian', 'Cyclist'] point_cloud_range = [-76.8, -51.2, -2, 76.8, 51.2, 4] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'waymo_dbinfos_train.pkl', rate=1.0, prepare=dict( filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)), classes=class_names, sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10), points_loader=dict( type='LoadPointsFromFile', load_dim=5, use_dim=[0, 1, 2, 3, 4])) train_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', 
point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( samples_per_gpu=4, workers_per_gpu=4, train=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_train.pkl', split='training', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR', # load one frame every five frames load_interval=5)), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_val.pkl', split='training', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_val.pkl', split='training', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR')) ================================================ FILE: configs/sparsefusion_nusc_voxel_LC_SwinT.py ================================================ point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0] class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] voxel_size = [0.075, 0.075, 0.2] out_size_factor = 8 evaluation = dict(interval=1) dataset_type = 'NuScenesDataset_ViewInfo' data_root = 'data/nuscenes/' input_modality = dict( use_lidar=True, use_camera=True, use_radar=False, use_map=False, use_external=False) img_scale = (800, 448) num_views = 6 img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], ), dict( type='LoadPointsFromMultiSweeps', 
sweeps_num=10,
        use_dim=[0, 1, 2, 3, 4],
    ),
    dict(type='MyLoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_bbox=True, with_label=True, with_centers=True, with_cam_bbox=True, with_visible=True),
    dict(type='LoadMultiViewImageFromFiles'),
    # 3D augmentation (rotation / scale / translation, then BEV flips)
    dict(
        type='OurGlobalRotScaleTrans',
        rot_range=[-0.3925 * 2, 0.3925 * 2],
        scale_ratio_range=[0.9, 1.1],
        translation_std=[0.5, 0.5, 0.5],
    ),
    dict(
        type='OurRandomFlip3D',
        sync_2d=False,
        flip_ratio_bev_horizontal=0.5,
        flip_ratio_bev_vertical=0.5),
    # dict(type='PhotoMetricDistortionMultiViewImage', swap_channel=False),  # color augmentation cannot improve the performance
    # 2D image augmentation and preprocessing
    dict(type='OurRandomAffine', scaling_ratio_range=(0.9, 1.1), flip_ratio=0.5, flip_sync_3d=True),
    dict(type='MyResize', img_scale=img_scale, keep_ratio=True),
    dict(type='MyNormalize', **img_norm_cfg),
    dict(type='MyPad', size_divisor=32),
    dict(type='SparseDepth', scale_factors=[4], exp_time=0),
    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='OurObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='PointShuffle'),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes', 'gt_labels', 'gt_pts_centers_view',
                                 'gt_img_centers_view', 'gt_bboxes_cam_view', 'gt_bboxes_lidar_view', 'sparse_depth', 'gt_visible_3d'])
]
test_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=[0, 1, 2, 3, 4],
    ),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        use_dim=[0, 1, 2, 3, 4],
    ),
    dict(type='LoadMultiViewImageFromFiles'),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=img_scale,
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            # identity rotation/scale/translation: no geometric augmentation
            dict(
                type='GlobalRotScaleTrans',
                rot_range=[0, 0],
                scale_ratio_range=[1.0, 1.0],
                translation_std=[0, 0, 0]),
            dict(type='RandomFlip3D'),
            dict(type='MyResize', img_scale=img_scale, keep_ratio=True),
            dict(type='MyNormalize', **img_norm_cfg),
            dict(type='MyPad', size_divisor=32),
            dict(type='SparseDepth', scale_factors=[4]),
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='Collect3D', keys=['points', 'img', 'sparse_depth'])
        ])
]
# our default setting uses 4 GPUs with 3 samples per-GPU, please ensure the LR consistent with your batch size
# NOTE(review): `data_root` already ends with '/', so the `ann_file` values
# below contain a double slash ('data/nuscenes//...'); this is normally
# harmless on POSIX file systems but worth knowing when comparing paths.
data = dict(
    samples_per_gpu=3,
    workers_per_gpu=4,
    train=dict(
        type='CBGSDataset',
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            num_views=num_views,
            ann_file=data_root + '/nuscenes_infos_w_views_train.pkl',
            load_interval=1,
            pipeline=train_pipeline,
            classes=class_names,
            modality=input_modality,
            test_mode=False,
            box_type_3d='LiDAR')),
    # val and test are configured identically (both read the val infos)
    val=dict(
        type=dataset_type,
        data_root=data_root,
        num_views=num_views,
        ann_file=data_root + '/nuscenes_infos_w_views_val.pkl',
        load_interval=1,
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True,
        box_type_3d='LiDAR'),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        num_views=num_views,
        ann_file=data_root + '/nuscenes_infos_w_views_val.pkl',
        load_interval=1,
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True,
        box_type_3d='LiDAR'))
model = dict(
    type='SparseFusionDetector',
    freeze_img=False,
    # ---- camera branch ----
    img_backbone=dict(
        type='SwinTransformer',
        embed_dims=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=7,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.2,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        with_cp=False,
        convert_weights=True,
    ),
    img_neck=dict(
        type='FPN',
        # 96 * (1, 2, 4, 8): presumably the per-stage widths of the Swin
        # backbone above (embed_dims=96) - confirm
        in_channels=[96, 192, 384, 768],
        out_channels=256,
        num_outs=5),
    # ---- LiDAR branch ----
    pts_voxel_layer=dict(
        max_num_points=10,
        voxel_size=voxel_size,
        max_voxels=(120000, 160000),
        point_cloud_range=point_cloud_range),
    pts_voxel_encoder=dict(
        type='HardSimpleVFE',
        num_features=5,
    ),
    pts_middle_encoder=dict(
        type='SparseEncoder',
        in_channels=5,
        sparse_shape=[41, 1440, 1440],
        output_channels=128,
        order=('conv', 'norm', 'act'),
        encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)),
        encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)),
        block_type='basicblock'),
    pts_backbone=dict(
        type='SECOND',
        in_channels=256,
        out_channels=[128, 256],
        layer_nums=[5, 5],
        layer_strides=[1, 2],
        norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
        conv_cfg=dict(type='Conv2d', bias=False)),
    pts_neck=dict(
        type='SECONDFPN',
        in_channels=[128, 256],
        out_channels=[256, 256],
        upsample_strides=[1, 2],
        norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
        upsample_cfg=dict(type='deconv', bias=False),
        use_conv_for_no_stride=True),
    # ---- fusion head ----
    pts_bbox_head=dict(
        type='SparseFusionHead2D_Deform',
        num_views=num_views,
        in_channels_img=256,
        out_size_factor_img=4,
        # 256 * 2 matches the sum of pts_neck.out_channels above
        in_channels=256 * 2,
        hidden_channel=128,
        num_heads=8,
        num_classes=len(class_names),
        ffn_channel=256,
        dropout=0.1,
        bn_momentum=0.1,
        activation='relu',
        img_reg_bn=False,
        img_reg_layer=3,
        common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
        num_proposals=200,  # query number in the LiDAR branch
        num_img_proposals=200,  # query number in the camera branch
        level_num=4,
        num_pts_decoder_layers=1,  # number of transformer layers in the point detector (if you set it >1, ensure it is consistent with your pretrained LiDAR-only model or set "freeze_lidar_detector=False")
        num_img_decoder_layers=1,  # number of transformer layers in the image detector
        num_fusion_decoder_layers=1,  # number of the transformer layers in the fusion stage
        initialize_by_heatmap=True,  # initialize the queries based on the heatmap (we never set it as False)
        semantic_transfer=True,  # whether to use semantic transfer (camera to LiDAR)
        cross_only=True,  # if false, output heatmap would be the average of semantic transfer and the LiDAR-only heatmap of TransFusion-L
        cross_heatmap_layer=1,
        nms_kernel_size=3,  # suppress nearby proposals when initializing queries for the LiDAR branch
        geometric_transfer=True,  # whether to use geometric transfer
        depth_input_channel=2,  # channel number of depth features. Do not change it unless you modify the SparseDepth class in "mmdet3d/datasets/pipelines/loading.py"
        img_heatmap_layer=2,
        img_nms_kernel_size=3,  # suppress nearby proposals when initializing queries for the camera branch
        view_transform=True,  # whether to transform the coordinate for the output bboxes of the camera branch
        use_camera='se',  # "se" or None: whether to encode the camera parameters in the view transformation
        bbox_coder=dict(
            type='TransFusionBBoxCoder',
            # only the first two entries (x_min, y_min) and (dx, dy) are passed
            pc_range=point_cloud_range[:2],
            voxel_size=voxel_size[:2],
            out_size_factor=out_size_factor,
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            score_threshold=0.0,
            code_size=10,
        ),
        bbox_2d_coder=dict(
            type='CameraBBoxCoder',
            code_size=10,
        ),
        loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
        loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=0.1),
        loss_heatmap_2d=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=0.1),
        loss_center_2d=dict(type='L1Loss', reduction='mean', loss_weight=5.0),
    ),
    train_cfg=dict(
        pts=dict(
            dataset='nuScenes',
            # Hungarian matching for the LiDAR-frame boxes ...
            assigner=dict(
                type='HungarianAssigner3D',
                iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'),
                cls_cost=dict(type='FocalLossCost', gamma=2, alpha=0.25, weight=0.15),
                reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25),
                iou_cost=dict(type='IoU3DCost', weight=0.25)
            ),
            # ... and a second one, with the same cost weights, for the
            # camera-frame boxes
            assigner_2d=dict(
                type='HungarianAssignerCameraBox',
                iou_calculator=dict(type='BboxOverlaps3D', coordinate='camera'),
                cls_cost=dict(type='FocalLossCost', gamma=2, alpha=0.25, weight=0.15),
                reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25),
                iou_cost=dict(type='IoU3DCost', weight=0.25),
            ),
            pos_weight=-1,
            gaussian_overlap=0.1,
            gaussian_overlap_2d=0.1,
            min_radius=2,
            max_radius=999,
            grid_size=[1440, 1440, 40],  # [x_len, y_len, z_len] = range / voxel_size: 108 / 0.075 = 1440, 8 / 0.2 = 40 (comment previously read "[x_len, y_len, 1]")
            voxel_size=voxel_size,
            out_size_factor=out_size_factor,
            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
            img_code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
            point_cloud_range=point_cloud_range)),
    test_cfg=dict(
        pts=dict(
            dataset='nuScenes',
            grid_size=[1440, 1440, 40],
            img_scale=img_scale,
            out_size_factor=out_size_factor,
            pc_range=point_cloud_range,
            voxel_size=voxel_size,
            nms_type='circle',
        )))
optimizer = dict(
    type='AdamW',
    lr=0.000075,
    weight_decay=0.01,
    # Per-module multipliers on the base lr / weight decay, keyed by
    # parameter-name substring.
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1, decay_mult=5),
            'img_neck': dict(lr_mult=0.1),
            'pts_voxel_layer': dict(lr_mult=0.1),
            'pts_voxel_encoder': dict(lr_mult=0.1),
            'pts_middle_encoder': dict(lr_mult=0.1),
            'pts_backbone': dict(lr_mult=0.1),
            'pts_neck': dict(lr_mult=0.1),
            'pts_bbox_head.point_transformer': dict(lr_mult=0.1),
            'pts_bbox_head.class_encoding': dict(lr_mult=0.1),
            'pts_bbox_head.heatmap_head': dict(lr_mult=0.1),
            'pts_bbox_head.shared_conv': dict(lr_mult=0.1),
            'absolute_pos_embed': dict(decay_mult=0.),
            'relative_position_bias_table': dict(decay_mult=0.),
            'norm': dict(decay_mult=0.)
        }),
)  # for 4gpu * 3sample_per_gpu
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
lr_config = dict(
    policy='cyclic',
    target_ratio=(8, 0.0001),
    cyclic_times=1,
    step_ratio_up=0.4,
)
momentum_config = dict(
    policy='cyclic',
    target_ratio=(0.8947368421052632, 1),
    cyclic_times=1,
    step_ratio_up=0.4)
total_epochs = 6
checkpoint_config = dict(interval=1)
log_config = dict(
    interval=50,
    hooks=[dict(type='TextLoggerHook'),
           dict(type='TensorboardLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = None
load_from = 'checkpoints/sparsefusion_voxel0075_SwinT_initial.pth'
resume_from = None
workflow = [('train', 1)]
# NOTE(review): lists 8 devices although the comments above assume 4 GPUs;
# presumably overridden by the training launcher - confirm.
gpu_ids = range(0, 8)
freeze_lidar_components = True  # freeze the LiDAR backbone
freeze_lidar_detector = True  # freeze the LiDAR detector
find_unused_parameters = True

# Evaluating bboxes of pts_bbox
# mAP: 0.7102
# mATE: 0.2778
# mASE: 0.2477
# mAOE: 0.2701
# mAVE: 0.2529
# mAAE: 0.1881
# NDS: 0.7314
# Eval time: 133.6s
#
# Per-class results:
# Object Class AP ATE ASE AOE AVE AAE
# car 0.883 0.171 0.147 0.067 0.263 0.184
# truck 0.651 0.306 0.176 0.078 0.230 0.216
# bus 0.777 0.306 0.178 0.043 0.396 0.256
# trailer 0.453 0.527 0.211 0.466 0.184 0.164
# construction_vehicle 0.308 0.686 0.420 0.857 0.124 0.316
# pedestrian 0.897 0.128 0.280 0.328 0.215 0.099
# motorcycle 0.823 0.188 0.236 0.216 0.421 0.254
# bicycle 0.727 0.164 0.262 0.314 0.189 0.016
# traffic_cone 0.803 0.118 0.298 nan nan nan
# barrier 0.779 0.185 0.269 0.060 nan nan

================================================
FILE: configs/sparsefusion_nusc_voxel_LC_r50.py
================================================
point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
voxel_size = [0.075, 0.075, 0.2]
out_size_factor = 8
evaluation = dict(interval=1)
dataset_type = 'NuScenesDataset_ViewInfo'
data_root = 
'data/nuscenes/' input_modality = dict( use_lidar=True, use_camera=True, use_radar=False, use_map=False, use_external=False) img_scale = (800, 448) num_views = 6 img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], ), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, use_dim=[0, 1, 2, 3, 4], ), dict(type='MyLoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_bbox=True, with_label=True, with_centers=True, with_cam_bbox=True, with_visible=True), dict(type='LoadMultiViewImageFromFiles'), dict( type='OurGlobalRotScaleTrans', rot_range=[-0.3925 * 2, 0.3925 * 2], scale_ratio_range=[0.9, 1.1], translation_std=[0.5, 0.5, 0.5], ), dict( type='OurRandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), # dict(type='PhotoMetricDistortionMultiViewImage', swap_channel=False), # color augmentation cannot improve the performance dict(type='OurRandomAffine', scaling_ratio_range=(0.9, 1.1), flip_ratio=0.5, flip_sync_3d=True), dict(type='MyResize', img_scale=img_scale, keep_ratio=True), dict(type='MyNormalize', **img_norm_cfg), dict(type='MyPad', size_divisor=32), dict(type='SparseDepth', scale_factors=[4], exp_time=0), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='OurObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes', 'gt_labels', 'gt_pts_centers_view', 'gt_img_centers_view', 'gt_bboxes_cam_view', 'gt_bboxes_lidar_view', 'sparse_depth', 'gt_visible_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], ), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, 
use_dim=[0, 1, 2, 3, 4], ), dict(type='LoadMultiViewImageFromFiles'), dict( type='MultiScaleFlipAug3D', img_scale=img_scale, pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1.0, 1.0], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict(type='MyResize', img_scale=img_scale, keep_ratio=True), dict(type='MyNormalize', **img_norm_cfg), dict(type='MyPad', size_divisor=32), dict(type='SparseDepth', scale_factors=[4]), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points', 'img', 'sparse_depth']) ]) ] # our default setting uses 4 GPUs with 4 samples per-GPU, please ensure the LR consistent with your batch size data = dict( samples_per_gpu=4, workers_per_gpu=4, train=dict( type='CBGSDataset', dataset=dict( type=dataset_type, data_root=data_root, num_views=num_views, ann_file=data_root + '/nuscenes_infos_w_views_train.pkl', load_interval=1, pipeline=train_pipeline, classes=class_names, modality=input_modality, test_mode=False, box_type_3d='LiDAR')), val=dict( type=dataset_type, data_root=data_root, num_views=num_views, ann_file=data_root + '/nuscenes_infos_w_views_val.pkl', load_interval=1, pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, num_views=num_views, ann_file=data_root + '/nuscenes_infos_w_views_val.pkl', load_interval=1, pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR')) model = dict( type='SparseFusionDetector', freeze_img=False, img_backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, in_channels=3, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch', ), img_neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), pts_voxel_layer=dict( max_num_points=10, 
voxel_size=voxel_size, max_voxels=(120000, 160000), point_cloud_range=point_cloud_range), pts_voxel_encoder=dict( type='HardSimpleVFE', num_features=5, ), pts_middle_encoder=dict( type='SparseEncoder', in_channels=5, sparse_shape=[41, 1440, 1440], output_channels=128, order=('conv', 'norm', 'act'), encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), block_type='basicblock'), pts_backbone=dict( type='SECOND', in_channels=256, out_channels=[128, 256], layer_nums=[5, 5], layer_strides=[1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[128, 256], out_channels=[256, 256], upsample_strides=[1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), pts_bbox_head=dict( type='SparseFusionHead2D_Deform', num_views=num_views, in_channels_img=256, out_size_factor_img=4, in_channels=256 * 2, hidden_channel=128, num_heads=8, num_classes=len(class_names), ffn_channel=256, dropout=0.1, bn_momentum=0.1, activation='relu', img_reg_bn=False, img_reg_layer=3, common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), num_proposals=200, # query number in the LiDAR branch num_img_proposals=200, # query number in the camera branch level_num=4, num_pts_decoder_layers=1, # number of transformer layers in the point detector (if you set it >1, ensure it is consistent with your pretrained LiDAR-only model or set "freeze_lidar_detector=False") num_img_decoder_layers=1, # number of transformer layers in the image detector num_fusion_decoder_layers=1, # number of the transformer layers in the fusion stage initialize_by_heatmap=True, # initialize the queries based on the heatmap (we never set it as False) semantic_transfer=True, # whether to use semantic transfer (camera to LiDAR) cross_only=True, # if false, 
output heatmap would be the average of semantic transfer and the LiDAR-only heatmap of TransFusion-L cross_heatmap_layer=1, nms_kernel_size=3, # suppress nearby proposals when initializing queries for the LiDAR branch geometric_transfer=True, # whether to use geometric transfer depth_input_channel=2, # channel number of depth features. Do not change it unless you modify the SparseDepth class in "mmdet3d/datasets/pipelines/loading.py" img_heatmap_layer=2, img_nms_kernel_size=3, # suppress nearby proposals when initializing queries for the camera branch view_transform=True, # whether to transform the coordinate for the output bboxes of the camera branch use_camera='se', # "se" or None: whether to encode the camera parameters in the view transformation bbox_coder=dict( type='TransFusionBBoxCoder', pc_range=point_cloud_range[:2], voxel_size=voxel_size[:2], out_size_factor=out_size_factor, post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], score_threshold=0.0, code_size=10, ), bbox_2d_coder=dict( type='CameraBBoxCoder', code_size=10, ), loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=1.0), loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=0.1), loss_heatmap_2d=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=0.1), loss_center_2d=dict(type='L1Loss', reduction='mean', loss_weight=5.0), ), train_cfg=dict( pts=dict( dataset='nuScenes', assigner=dict( type='HungarianAssigner3D', iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'), cls_cost=dict(type='FocalLossCost', gamma=2, alpha=0.25, weight=0.15), reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25), iou_cost=dict(type='IoU3DCost', weight=0.25) ), assigner_2d=dict( type='HungarianAssignerCameraBox', iou_calculator=dict(type='BboxOverlaps3D', coordinate='camera'), cls_cost=dict(type='FocalLossCost', gamma=2, alpha=0.25, weight=0.15), 
reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25), iou_cost=dict(type='IoU3DCost', weight=0.25), ), pos_weight=-1, gaussian_overlap=0.1, gaussian_overlap_2d=0.1, min_radius=2, max_radius=999, grid_size=[1440, 1440, 40], # [x_len, y_len, 1] voxel_size=voxel_size, out_size_factor=out_size_factor, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], img_code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], point_cloud_range=point_cloud_range)), test_cfg=dict( pts=dict( dataset='nuScenes', grid_size=[1440, 1440, 40], img_scale=img_scale, out_size_factor=out_size_factor, pc_range=point_cloud_range, voxel_size=voxel_size, nms_type='circle', ))) optimizer = dict( type='AdamW', lr=0.0001, weight_decay=0.01, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), 'img_neck': dict(lr_mult=0.1), 'pts_voxel_layer': dict(lr_mult=0.1), 'pts_voxel_encoder': dict(lr_mult=0.1), 'pts_middle_encoder': dict(lr_mult=0.1), 'pts_backbone': dict(lr_mult=0.1), 'pts_neck': dict(lr_mult=0.1), 'pts_bbox_head.point_transformer': dict(lr_mult=0.1), 'pts_bbox_head.class_encoding': dict(lr_mult=0.1), 'pts_bbox_head.heatmap_head': dict(lr_mult=0.1), 'pts_bbox_head.shared_conv': dict(lr_mult=0.1), }), ) # for 4gpu * 4sample_per_gpu optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) lr_config = dict( policy='cyclic', target_ratio=(8, 0.0001), cyclic_times=1, step_ratio_up=0.4, ) momentum_config = dict( policy='cyclic', target_ratio=(0.8947368421052632, 1), cyclic_times=1, step_ratio_up=0.4) total_epochs = 6 checkpoint_config = dict(interval=1) log_config = dict( interval=50, hooks=[dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook')]) dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = None load_from = 'checkpoints/sparsefusion_voxel0075_R50_initial.pth' resume_from = None workflow = [('train', 1)] gpu_ids = range(0, 8) freeze_lidar_components = True # freeze the LiDAR backbone freeze_lidar_detector = True # freeze the LiDAR 
detector find_unused_parameters = True # Evaluating bboxes of pts_bbox # mAP: 0.7051 # mATE: 0.2757 # mASE: 0.2506 # mAOE: 0.2767 # mAVE: 0.2562 # mAAE: 0.1869 # NDS: 0.7279 # Eval time: 137.2s # # Per-class results: # Object Class AP ATE ASE AOE AVE AAE # car 0.883 0.171 0.146 0.066 0.262 0.187 # truck 0.643 0.305 0.177 0.071 0.235 0.211 # bus 0.775 0.304 0.177 0.044 0.411 0.250 # trailer 0.447 0.522 0.214 0.432 0.179 0.159 # construction_vehicle 0.303 0.669 0.424 0.842 0.127 0.326 # pedestrian 0.898 0.127 0.282 0.329 0.216 0.104 # motorcycle 0.810 0.189 0.241 0.215 0.426 0.249 # bicycle 0.712 0.164 0.263 0.422 0.193 0.010 # traffic_cone 0.808 0.118 0.309 nan nan nan # barrier 0.772 0.188 0.273 0.068 nan nan ================================================ FILE: configs/ssn/README.md ================================================ # SSN: Shape Signature Networks for Multi-class Object Detection from Point Clouds ## Introduction [ALGORITHM] We implement PointPillars with the shape-aware grouping heads used in SSN and provide the results and checkpoints on the nuScenes and Lyft datasets. 
``` @inproceedings{zhu2020ssn, title={SSN: Shape Signature Networks for Multi-class Object Detection from Point Clouds}, author={Zhu, Xinge and Ma, Yuexin and Wang, Tai and Xu, Yan and Shi, Jianping and Lin, Dahua}, booktitle={Proceedings of the European Conference on Computer Vision}, year={2020} } ``` ## Results ### NuScenes | Backbone | Lr schd | Mem (GB) | Inf time (fps) | mAP | NDS | Download | | :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | |[SECFPN](../pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py)|2x|16.4||35.17|49.76|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230725-0817d270.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230725.log.json)| |[SSN](./hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d.py)|2x|9.62||41.56|54.83|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d/hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d_20201023_193737-5fda3f00.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d/hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d_20201023_193737.log.json)| |[RegNetX-400MF-SECFPN](../regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d.py)|2x|16.4||41.15|55.20|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230334-53044f32.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230334.log.json)| 
|[RegNetX-400MF-SSN](./hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_nus-3d.py)|2x|10.26||46.95|58.24|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/ssn/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_nus-3d/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_nus-3d_20201024_232447-7af3d8c8.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/ssn/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_nus-3d/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_nus-3d_20201024_232447.log.json)| ### Lyft | Backbone | Lr schd | Mem (GB) | Inf time (fps) | Private Score | Public Score | Download | | :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | |[SECFPN](../pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_lyft-3d.py)|2x|||13.4|13.4|| |[SSN](./hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d.py)|2x|8.30||17.4|17.5|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d_20201016_220844-3058d9fc.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d_20201016_220844.log.json)| |[RegNetX-400MF-SSN](./hv_ssn_regnet-400mf_secfpn_sbn-all_1x16_2x_lyft-3d.py)|2x|9.98||18.1|18.3|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/ssn/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_lyft-3d/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_lyft-3d_20201025_213155-4532096c.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/ssn/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_lyft-3d/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_lyft-3d_20201025_213155.log.json)| Note: The main difference between the shape-aware grouping heads and the original SECOND FPN heads is that the former group objects with similar sizes and shapes together and use a shape-specific head for each group. 
Heavier heads (with more convolutions and larger strides) are designed for large objects, while lighter heads are used for small objects. Note that the outputs may have different feature map sizes, so an anchor generator tailored to these feature maps is also needed in the implementation. Users can try other settings for the head design. Our implementation largely follows the original one [HERE](https://github.com/xinge008/SSN). ================================================ FILE: configs/ssn/hv_ssn_regnet-400mf_secfpn_sbn-all_1x16_2x_lyft-3d.py ================================================ _base_ = './hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d.py' # model settings model = dict( type='MVXFasterRCNN', pretrained=dict(pts='open-mmlab://regnetx_400mf'), pts_backbone=dict( _delete_=True, type='NoStemRegNet', arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0), out_indices=(1, 2, 3), frozen_stages=-1, strides=(1, 2, 2, 2), base_channels=64, stem_channels=64, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), norm_eval=False, style='pytorch'), pts_neck=dict(in_channels=[64, 160, 384])) # dataset settings data = dict(samples_per_gpu=1, workers_per_gpu=2) ================================================ FILE: configs/ssn/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_nus-3d.py ================================================ _base_ = './hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d.py' # model settings model = dict( type='MVXFasterRCNN', pretrained=dict(pts='open-mmlab://regnetx_400mf'), pts_backbone=dict( _delete_=True, type='NoStemRegNet', arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0), out_indices=(1, 2, 3), frozen_stages=-1, strides=(1, 2, 2, 2), base_channels=64, stem_channels=64, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), norm_eval=False, style='pytorch'), pts_neck=dict(in_channels=[64, 160, 384])) ================================================ FILE: configs/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d.py 
================================================ _base_ = [ '../_base_/models/hv_pointpillars_fpn_lyft.py', '../_base_/datasets/lyft-3d.py', '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py', ] point_cloud_range = [-100, -100, -5, 100, 100, 3] # Note that the order of class names should be consistent with # the following anchors' order class_names = [ 'bicycle', 'motorcycle', 'pedestrian', 'animal', 'car', 'emergency_vehicle', 'bus', 'other_vehicle', 'truck' ] train_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5), dict(type='LoadPointsFromMultiSweeps', sweeps_num=10), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5), dict(type='LoadPointsFromMultiSweeps', sweeps_num=10), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=4, train=dict(pipeline=train_pipeline, classes=class_names), val=dict(pipeline=test_pipeline, classes=class_names), 
test=dict(pipeline=test_pipeline, classes=class_names)) # model settings model = dict( pts_voxel_layer=dict(point_cloud_range=[-100, -100, -5, 100, 100, 3]), pts_voxel_encoder=dict( feat_channels=[32, 64], point_cloud_range=[-100, -100, -5, 100, 100, 3]), pts_middle_encoder=dict(output_shape=[800, 800]), pts_neck=dict( _delete_=True, type='SECONDFPN', norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), in_channels=[64, 128, 256], upsample_strides=[1, 2, 4], out_channels=[128, 128, 128]), pts_bbox_head=dict( _delete_=True, type='ShapeAwareHead', num_classes=9, in_channels=384, feat_channels=384, use_direction_classifier=True, anchor_generator=dict( type='AlignedAnchor3DRangeGeneratorPerCls', ranges=[[-100, -100, -1.0709302, 100, 100, -1.0709302], [-100, -100, -1.3220503, 100, 100, -1.3220503], [-100, -100, -0.9122268, 100, 100, -0.9122268], [-100, -100, -1.8012227, 100, 100, -1.8012227], [-100, -100, -1.0715024, 100, 100, -1.0715024], [-100, -100, -0.8871424, 100, 100, -0.8871424], [-100, -100, -0.3519405, 100, 100, -0.3519405], [-100, -100, -0.6276341, 100, 100, -0.6276341], [-100, -100, -0.3033737, 100, 100, -0.3033737]], sizes=[ [0.63, 1.76, 1.44], # bicycle [0.96, 2.35, 1.59], # motorcycle [0.76, 0.80, 1.76], # pedestrian [0.35, 0.73, 0.50], # animal [1.92, 4.75, 1.71], # car [2.42, 6.52, 2.34], # emergency vehicle [2.92, 12.70, 3.42], # bus [2.75, 8.17, 3.20], # other vehicle [2.84, 10.24, 3.44] # truck ], custom_values=[], rotations=[0, 1.57], reshape_out=False), tasks=[ dict( num_class=2, class_names=['bicycle', 'motorcycle'], shared_conv_channels=(64, 64), shared_conv_strides=(1, 1), norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)), dict( num_class=2, class_names=['pedestrian', 'animal'], shared_conv_channels=(64, 64), shared_conv_strides=(1, 1), norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)), dict( num_class=2, class_names=['car', 'emergency_vehicle'], shared_conv_channels=(64, 64, 64), shared_conv_strides=(2, 1, 1), 
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)), dict( num_class=3, class_names=['bus', 'other_vehicle', 'truck'], shared_conv_channels=(64, 64, 64), shared_conv_strides=(2, 1, 1), norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)) ], assign_per_class=True, diff_rad_by_sin=True, dir_offset=0.7854, # pi/4 dir_limit_offset=0, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2))) # model training and testing settings train_cfg = dict( _delete_=True, pts=dict( assigner=[ dict( # bicycle type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.55, neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), dict( # motorcycle type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.55, neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), dict( # pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.55, neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), dict( # animal type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.55, neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), dict( # car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1), dict( # emergency vehicle type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.55, neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), dict( # bus type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1), dict( # other vehicle type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.55, 
neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), dict( # truck type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1) ], allowed_border=0, code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], pos_weight=-1, debug=False)) ================================================ FILE: configs/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d.py ================================================ _base_ = [ '../_base_/models/hv_pointpillars_fpn_nus.py', '../_base_/datasets/nus-3d.py', '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py', ] # Note that the order of class names should be consistent with # the following anchors' order point_cloud_range = [-50, -50, -5, 50, 50, 3] class_names = [ 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier', 'car', 'truck', 'trailer', 'bus', 'construction_vehicle' ] train_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5), dict(type='LoadPointsFromMultiSweeps', sweeps_num=10), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5), dict(type='LoadPointsFromMultiSweeps', sweeps_num=10), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], 
translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=4, train=dict(pipeline=train_pipeline, classes=class_names), val=dict(pipeline=test_pipeline, classes=class_names), test=dict(pipeline=test_pipeline, classes=class_names)) # model settings model = dict( pts_voxel_layer=dict(max_num_points=20), pts_voxel_encoder=dict(feat_channels=[64, 64]), pts_neck=dict( _delete_=True, type='SECONDFPN', norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), in_channels=[64, 128, 256], upsample_strides=[1, 2, 4], out_channels=[128, 128, 128]), pts_bbox_head=dict( _delete_=True, type='ShapeAwareHead', num_classes=10, in_channels=384, feat_channels=384, use_direction_classifier=True, anchor_generator=dict( type='AlignedAnchor3DRangeGeneratorPerCls', ranges=[[-50, -50, -1.67339111, 50, 50, -1.67339111], [-50, -50, -1.71396371, 50, 50, -1.71396371], [-50, -50, -1.61785072, 50, 50, -1.61785072], [-50, -50, -1.80984986, 50, 50, -1.80984986], [-50, -50, -1.76396500, 50, 50, -1.76396500], [-50, -50, -1.80032795, 50, 50, -1.80032795], [-50, -50, -1.74440365, 50, 50, -1.74440365], [-50, -50, -1.68526504, 50, 50, -1.68526504], [-50, -50, -1.80673031, 50, 50, -1.80673031], [-50, -50, -1.64824291, 50, 50, -1.64824291]], sizes=[ [0.60058911, 1.68452161, 1.27192197], # bicycle [0.76279481, 2.09973778, 1.44403034], # motorcycle [0.66344886, 0.72564370, 1.75748069], # pedestrian [0.39694519, 0.40359262, 1.06232151], # traffic cone [2.49008838, 0.48578221, 0.98297065], # barrier [1.95017717, 4.60718145, 1.72270761], # car [2.45609390, 6.73778078, 2.73004906], # truck [2.87427237, 12.01320693, 3.81509561], # trailer [2.94046906, 11.1885991, 3.47030982], # bus [2.73050468, 6.38352896, 3.13312415] # construction vehicle ], custom_values=[0, 0], 
rotations=[0, 1.57], reshape_out=False), tasks=[ dict( num_class=2, class_names=['bicycle', 'motorcycle'], shared_conv_channels=(64, 64), shared_conv_strides=(1, 1), norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)), dict( num_class=1, class_names=['pedestrian'], shared_conv_channels=(64, 64), shared_conv_strides=(1, 1), norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)), dict( num_class=2, class_names=['traffic_cone', 'barrier'], shared_conv_channels=(64, 64), shared_conv_strides=(1, 1), norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)), dict( num_class=1, class_names=['car'], shared_conv_channels=(64, 64, 64), shared_conv_strides=(2, 1, 1), norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)), dict( num_class=4, class_names=[ 'truck', 'trailer', 'bus', 'construction_vehicle' ], shared_conv_channels=(64, 64, 64), shared_conv_strides=(2, 1, 1), norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)) ], assign_per_class=True, diff_rad_by_sin=True, dir_offset=0.7854, # pi/4 dir_limit_offset=0, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( _delete_=True, pts=dict( assigner=[ dict( # bicycle type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # motorcycle type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), dict( # pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), dict( # traffic cone type='MaxIoUAssigner', 
iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), dict( # barrier type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.55, neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), dict( # car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1), dict( # truck type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.55, neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), dict( # trailer type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # bus type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.55, neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), dict( # construction vehicle type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1) ], allowed_border=0, code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], pos_weight=-1, debug=False))) ================================================ FILE: configs/transfusion_nusc_pillar_L.py ================================================ point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] voxel_size = [0.2, 0.2, 8] out_size_factor = 4 evaluation = dict(interval=1) dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' input_modality = dict( use_lidar=True, use_camera=False, use_radar=False, use_map=False, use_external=False) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], ), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, use_dim=[0, 1, 2, 3, 4], ), 
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='ObjectSample', db_sampler=dict( data_root=None, info_path=data_root + 'nuscenes_dbinfos_train.pkl', rate=1.0, prepare=dict( filter_by_difficulty=[-1], filter_by_min_points=dict( car=5, truck=5, bus=5, trailer=5, construction_vehicle=5, traffic_cone=5, barrier=5, motorcycle=5, bicycle=5, pedestrian=5)), classes=class_names, sample_groups=dict( car=2, truck=3, construction_vehicle=7, bus=4, trailer=6, barrier=2, motorcycle=6, bicycle=6, pedestrian=2, traffic_cone=2), points_loader=dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], ))), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925 * 2, 0.3925 * 2], scale_ratio_range=[0.9, 1.1], translation_std=[0.5, 0.5, 0.5]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], ), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, use_dim=[0, 1, 2, 3, 4], ), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1.0, 1.0], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=6, train=dict( type='CBGSDataset', dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + '/nuscenes_infos_train.pkl', 
load_interval=1, pipeline=train_pipeline, classes=class_names, modality=input_modality, test_mode=False, box_type_3d='LiDAR')), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + '/nuscenes_infos_val.pkl', load_interval=1, pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + '/nuscenes_infos_val.pkl', load_interval=1, pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR')) model = dict( type='TransFusionDetector', pts_voxel_layer=dict( max_num_points=20, voxel_size=voxel_size, max_voxels=(30000, 60000), point_cloud_range=point_cloud_range), pts_voxel_encoder=dict( type='PillarFeatureNet', in_channels=5, feat_channels=[64], with_distance=False, voxel_size=voxel_size, norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01), point_cloud_range=point_cloud_range, ), pts_middle_encoder=dict( type='PointPillarsScatter', in_channels=64, output_shape=(512, 512) ), pts_backbone=dict( type='SECOND', in_channels=64, out_channels=[64, 128, 256], layer_nums=[3, 5, 5], layer_strides=[2, 2, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[64, 128, 256], out_channels=[128, 128, 128], upsample_strides=[0.5, 1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), pts_bbox_head=dict( type='TransFusionHead', num_proposals=200, auxiliary=True, in_channels=128 * 3, hidden_channel=128, num_classes=len(class_names), num_decoder_layers=1, num_heads=8, learnable_query_pos=False, initialize_by_heatmap=True, nms_kernel_size=3, ffn_channel=256, dropout=0.1, bn_momentum=0.1, activation='relu', common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), bbox_coder=dict( type='TransFusionBBoxCoder', 
pc_range=point_cloud_range[:2], voxel_size=voxel_size[:2], out_size_factor=out_size_factor, post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], score_threshold=0.0, code_size=10, ), loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=1.0), # loss_iou=dict(type='CrossEntropyLoss', use_sigmoid=True, reduction='mean', loss_weight=0.0), loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0), ), train_cfg=dict( pts=dict( dataset='nuScenes', assigner=dict( type='HungarianAssigner3D', iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'), cls_cost=dict(type='FocalLossCost', gamma=2, alpha=0.25, weight=0.15), reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25), iou_cost=dict(type='IoU3DCost', weight=0.25) ), pos_weight=-1, gaussian_overlap=0.1, min_radius=2, grid_size=[512, 512, 1], # [x_len, y_len, 1] voxel_size=voxel_size, out_size_factor=out_size_factor, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], point_cloud_range=point_cloud_range)), test_cfg=dict( pts=dict( dataset='nuScenes', grid_size=[512, 512, 1], out_size_factor=out_size_factor, pc_range=point_cloud_range[0:2], voxel_size=voxel_size[:2], nms_type=None, ))) optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01) # for 8gpu * 2sample_per_gpu optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) lr_config = dict( policy='cyclic', target_ratio=(10, 0.0001), cyclic_times=1, step_ratio_up=0.4) momentum_config = dict( policy='cyclic', target_ratio=(0.8947368421052632, 1), cyclic_times=1, step_ratio_up=0.4) total_epochs = 20 checkpoint_config = dict(interval=1) log_config = dict( interval=50, hooks=[dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook')]) dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = None load_from = None resume_from = None workflow = [('train', 1)] gpu_ids = range(0, 8) 
================================================ FILE: configs/transfusion_nusc_pillar_LC.py ================================================ point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] voxel_size = [0.2, 0.2, 8] out_size_factor = 4 evaluation = dict(interval=1) dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' input_modality = dict( use_lidar=True, use_camera=True, use_radar=False, use_map=False, use_external=False) img_scale = (800, 448) num_views = 6 img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], ), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, use_dim=[0, 1, 2, 3, 4], ), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict(type='LoadMultiViewImageFromFiles'), # dict( # type='GlobalRotScaleTrans', # rot_range=[-0.3925 * 2, 0.3925 * 2], # scale_ratio_range=[0.9, 1.1], # translation_std=[0.5, 0.5, 0.5]), # dict( # type='RandomFlip3D', # sync_2d=True, # flip_ratio_bev_horizontal=0.5, # flip_ratio_bev_vertical=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict(type='PointShuffle'), dict(type='MyResize', img_scale=img_scale, keep_ratio=True), dict(type='MyNormalize', **img_norm_cfg), dict(type='MyPad', size_divisor=32), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], ), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, use_dim=[0, 1, 2, 3, 4], ), 
dict(type='LoadMultiViewImageFromFiles'), dict( type='MultiScaleFlipAug3D', img_scale=img_scale, pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1.0, 1.0], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict(type='MyResize', img_scale=img_scale, keep_ratio=True), dict(type='MyNormalize', **img_norm_cfg), dict(type='MyPad', size_divisor=32), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points', 'img']) ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=6, train=dict( type='CBGSDataset', dataset=dict( type=dataset_type, data_root=data_root, num_views=num_views, ann_file=data_root + '/nuscenes_infos_train.pkl', load_interval=1, pipeline=train_pipeline, classes=class_names, modality=input_modality, test_mode=False, box_type_3d='LiDAR')), val=dict( type=dataset_type, data_root=data_root, num_views=num_views, ann_file=data_root + '/nuscenes_infos_val.pkl', load_interval=1, pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, num_views=num_views, ann_file=data_root + '/nuscenes_infos_val.pkl', load_interval=1, pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR')) model = dict( type='TransFusionDetector', freeze_img=True, # img_backbone=dict( # type='DLASeg', # num_layers=34, # heads={}, # head_convs=-1, # ), img_backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch'), img_neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), pts_voxel_layer=dict( max_num_points=20, voxel_size=voxel_size, max_voxels=(30000, 60000), point_cloud_range=point_cloud_range), pts_voxel_encoder=dict( type='PillarFeatureNet', in_channels=5, 
feat_channels=[64], with_distance=False, voxel_size=voxel_size, norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01), point_cloud_range=point_cloud_range, ), pts_middle_encoder=dict( type='PointPillarsScatter', in_channels=64, output_shape=(512, 512) ), pts_backbone=dict( type='SECOND', in_channels=64, out_channels=[64, 128, 256], layer_nums=[3, 5, 5], layer_strides=[2, 2, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[64, 128, 256], out_channels=[128, 128, 128], upsample_strides=[0.5, 1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), pts_bbox_head=dict( type='TransFusionHead', fuse_img=True, num_views=num_views, in_channels_img=256, out_size_factor_img=4, num_proposals=200, auxiliary=True, in_channels=128 * 3, hidden_channel=128, num_classes=len(class_names), num_decoder_layers=1, num_heads=8, learnable_query_pos=False, initialize_by_heatmap=True, nms_kernel_size=3, ffn_channel=256, dropout=0.1, bn_momentum=0.1, activation='relu', common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), bbox_coder=dict( type='TransFusionBBoxCoder', pc_range=point_cloud_range[:2], voxel_size=voxel_size[:2], out_size_factor=out_size_factor, post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], score_threshold=0.0, code_size=10, ), loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=1.0), # loss_iou=dict(type='CrossEntropyLoss', use_sigmoid=True, reduction='mean', loss_weight=0.0), loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0), ), train_cfg=dict( pts=dict( dataset='nuScenes', assigner=dict( type='HungarianAssigner3D', iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'), cls_cost=dict(type='FocalLossCost', gamma=2, 
alpha=0.25, weight=0.15), reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25), iou_cost=dict(type='IoU3DCost', weight=0.25) ), pos_weight=-1, gaussian_overlap=0.1, min_radius=2, grid_size=[512, 512, 1], # [x_len, y_len, 1] voxel_size=voxel_size, out_size_factor=out_size_factor, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], point_cloud_range=point_cloud_range)), test_cfg=dict( pts=dict( dataset='nuScenes', grid_size=[512, 512, 1], out_size_factor=out_size_factor, pc_range=point_cloud_range[0:2], voxel_size=voxel_size[:2], nms_type=None, ))) optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01) # for 8gpu * 2sample_per_gpu optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) lr_config = dict( policy='cyclic', target_ratio=(10, 0.0001), cyclic_times=1, step_ratio_up=0.4) momentum_config = dict( policy='cyclic', target_ratio=(0.8947368421052632, 1), cyclic_times=1, step_ratio_up=0.4) total_epochs = 6 checkpoint_config = dict(interval=1) log_config = dict( interval=50, hooks=[dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook')]) dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = None load_from = 'checkpoints/fusion_pillar02_R50.pth' resume_from = None workflow = [('train', 1)] gpu_ids = range(0, 8) freeze_lidar_components = True find_unused_parameters = True ================================================ FILE: configs/transfusion_nusc_voxel_L.py ================================================ point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0] class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] voxel_size = [0.075, 0.075, 0.2] out_size_factor = 8 evaluation = dict(interval=1) dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' input_modality = dict( use_lidar=True, use_camera=False, use_radar=False, use_map=False, use_external=False) train_pipeline = [ dict( type='LoadPointsFromFile', 
coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], ), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, use_dim=[0, 1, 2, 3, 4], ), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='ObjectSample', db_sampler=dict( data_root=None, info_path=data_root + 'nuscenes_dbinfos_train.pkl', rate=1.0, prepare=dict( filter_by_difficulty=[-1], filter_by_min_points=dict( car=5, truck=5, bus=5, trailer=5, construction_vehicle=5, traffic_cone=5, barrier=5, motorcycle=5, bicycle=5, pedestrian=5)), classes=class_names, sample_groups=dict( car=2, truck=3, construction_vehicle=7, bus=4, trailer=6, barrier=2, motorcycle=6, bicycle=6, pedestrian=2, traffic_cone=2), points_loader=dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], ))), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925 * 2, 0.3925 * 2], scale_ratio_range=[0.9, 1.1], translation_std=[0.5, 0.5, 0.5]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], ), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, use_dim=[0, 1, 2, 3, 4], ), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1.0, 1.0], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( samples_per_gpu=2, 
workers_per_gpu=6, train=dict( type='CBGSDataset', dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + '/nuscenes_infos_train.pkl', load_interval=1, pipeline=train_pipeline, classes=class_names, modality=input_modality, test_mode=False, box_type_3d='LiDAR')), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + '/nuscenes_infos_val.pkl', load_interval=1, pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + '/nuscenes_infos_val.pkl', load_interval=1, pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR')) model = dict( type='TransFusionDetector', pts_voxel_layer=dict( max_num_points=10, voxel_size=voxel_size, max_voxels=(120000, 160000), point_cloud_range=point_cloud_range), pts_voxel_encoder=dict( type='HardSimpleVFE', num_features=5, ), pts_middle_encoder=dict( type='SparseEncoder', in_channels=5, sparse_shape=[41, 1440, 1440], output_channels=128, order=('conv', 'norm', 'act'), encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), block_type='basicblock'), pts_backbone=dict( type='SECOND', in_channels=256, out_channels=[128, 256], layer_nums=[5, 5], layer_strides=[1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[128, 256], out_channels=[256, 256], upsample_strides=[1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), pts_bbox_head=dict( type='TransFusionHead', num_proposals=200, auxiliary=True, in_channels=256 * 2, hidden_channel=128, num_classes=len(class_names), num_decoder_layers=1, num_heads=8, learnable_query_pos=False, initialize_by_heatmap=True, nms_kernel_size=3, 
ffn_channel=256, dropout=0.1, bn_momentum=0.1, activation='relu', common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), bbox_coder=dict( type='TransFusionBBoxCoder', pc_range=point_cloud_range[:2], voxel_size=voxel_size[:2], out_size_factor=out_size_factor, post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], score_threshold=0.0, code_size=10, ), loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=1.0), # loss_iou=dict(type='CrossEntropyLoss', use_sigmoid=True, reduction='mean', loss_weight=0.0), loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0), ), train_cfg=dict( pts=dict( dataset='nuScenes', assigner=dict( type='HungarianAssigner3D', iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'), cls_cost=dict(type='FocalLossCost', gamma=2, alpha=0.25, weight=0.15), reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25), iou_cost=dict(type='IoU3DCost', weight=0.25) ), pos_weight=-1, gaussian_overlap=0.1, min_radius=2, grid_size=[1440, 1440, 40], # [x_len, y_len, 1] voxel_size=voxel_size, out_size_factor=out_size_factor, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], point_cloud_range=point_cloud_range)), test_cfg=dict( pts=dict( dataset='nuScenes', grid_size=[1440, 1440, 40], out_size_factor=out_size_factor, pc_range=point_cloud_range[0:2], voxel_size=voxel_size[:2], nms_type=None, ))) optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01) # for 8gpu * 2sample_per_gpu optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) lr_config = dict( policy='cyclic', target_ratio=(10, 0.0001), cyclic_times=1, step_ratio_up=0.4) momentum_config = dict( policy='cyclic', target_ratio=(0.8947368421052632, 1), cyclic_times=1, step_ratio_up=0.4) total_epochs = 20 checkpoint_config = dict(interval=1) log_config = dict( interval=50, hooks=[dict(type='TextLoggerHook'), 
dict(type='TensorboardLoggerHook')]) dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = None load_from = None resume_from = None workflow = [('train', 1)] gpu_ids = range(0, 8) ================================================ FILE: configs/transfusion_nusc_voxel_LC.py ================================================ point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0] class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] voxel_size = [0.075, 0.075, 0.2] out_size_factor = 8 evaluation = dict(interval=1) dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' input_modality = dict( use_lidar=True, use_camera=True, use_radar=False, use_map=False, use_external=False) img_scale = (800, 448) num_views = 6 img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], ), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, use_dim=[0, 1, 2, 3, 4], ), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict(type='LoadMultiViewImageFromFiles'), # dict( # type='GlobalRotScaleTrans', # rot_range=[-0.3925 * 2, 0.3925 * 2], # scale_ratio_range=[0.9, 1.1], # translation_std=[0.5, 0.5, 0.5]), # dict( # type='RandomFlip3D', # sync_2d=True, # flip_ratio_bev_horizontal=0.5, # flip_ratio_bev_vertical=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict(type='PointShuffle'), dict(type='MyResize', img_scale=img_scale, keep_ratio=True), dict(type='MyNormalize', **img_norm_cfg), dict(type='MyPad', size_divisor=32), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ 
dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], ), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, use_dim=[0, 1, 2, 3, 4], ), dict(type='LoadMultiViewImageFromFiles'), dict( type='MultiScaleFlipAug3D', img_scale=img_scale, pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1.0, 1.0], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict(type='MyResize', img_scale=img_scale, keep_ratio=True), dict(type='MyNormalize', **img_norm_cfg), dict(type='MyPad', size_divisor=32), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points', 'img']) ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=6, train=dict( type='CBGSDataset', dataset=dict( type=dataset_type, data_root=data_root, num_views=num_views, ann_file=data_root + '/nuscenes_infos_train_20pc.pkl', load_interval=1, pipeline=train_pipeline, classes=class_names, modality=input_modality, test_mode=False, box_type_3d='LiDAR')), val=dict( type=dataset_type, data_root=data_root, num_views=num_views, ann_file=data_root + '/nuscenes_infos_val.pkl', load_interval=1, pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, num_views=num_views, ann_file=data_root + '/nuscenes_infos_val.pkl', load_interval=1, pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR')) model = dict( type='TransFusionDetector', freeze_img=True, # img_backbone=dict( # type='DLASeg', # num_layers=34, # heads={}, # head_convs=-1, # ), img_backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch'), img_neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), 
pts_voxel_layer=dict( max_num_points=10, voxel_size=voxel_size, max_voxels=(120000, 160000), point_cloud_range=point_cloud_range), pts_voxel_encoder=dict( type='HardSimpleVFE', num_features=5, ), pts_middle_encoder=dict( type='SparseEncoder', in_channels=5, sparse_shape=[41, 1440, 1440], output_channels=128, order=('conv', 'norm', 'act'), encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), block_type='basicblock'), pts_backbone=dict( type='SECOND', in_channels=256, out_channels=[128, 256], layer_nums=[5, 5], layer_strides=[1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[128, 256], out_channels=[256, 256], upsample_strides=[1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), pts_bbox_head=dict( type='TransFusionHead', fuse_img=True, num_views=num_views, in_channels_img=256, out_size_factor_img=4, num_proposals=200, auxiliary=True, in_channels=256 * 2, hidden_channel=128, num_classes=len(class_names), num_decoder_layers=1, num_heads=8, learnable_query_pos=False, initialize_by_heatmap=True, nms_kernel_size=3, ffn_channel=256, dropout=0.1, bn_momentum=0.1, activation='relu', common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), bbox_coder=dict( type='TransFusionBBoxCoder', pc_range=point_cloud_range[:2], voxel_size=voxel_size[:2], out_size_factor=out_size_factor, post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], score_threshold=0.0, code_size=10, ), loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=1.0), # loss_iou=dict(type='CrossEntropyLoss', use_sigmoid=True, reduction='mean', loss_weight=0.0), loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), loss_heatmap=dict(type='GaussianFocalLoss', 
reduction='mean', loss_weight=1.0), ), train_cfg=dict( pts=dict( dataset='nuScenes', assigner=dict( type='HungarianAssigner3D', iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'), cls_cost=dict(type='FocalLossCost', gamma=2, alpha=0.25, weight=0.15), reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25), iou_cost=dict(type='IoU3DCost', weight=0.25) ), pos_weight=-1, gaussian_overlap=0.1, min_radius=2, grid_size=[1440, 1440, 40], # [x_len, y_len, 1] voxel_size=voxel_size, out_size_factor=out_size_factor, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], point_cloud_range=point_cloud_range)), test_cfg=dict( pts=dict( dataset='nuScenes', grid_size=[1440, 1440, 40], out_size_factor=out_size_factor, pc_range=point_cloud_range[0:2], voxel_size=voxel_size[:2], nms_type=None, ))) optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01) # for 8gpu * 2sample_per_gpu optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) lr_config = dict( policy='cyclic', target_ratio=(10, 0.0001), cyclic_times=1, step_ratio_up=0.4) momentum_config = dict( policy='cyclic', target_ratio=(0.8947368421052632, 1), cyclic_times=1, step_ratio_up=0.4) total_epochs = 6 checkpoint_config = dict(interval=1) log_config = dict( interval=50, hooks=[dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook')]) dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = None load_from = 'checkpoints/fusion_voxel0075_R50.pth' resume_from = None workflow = [('train', 1)] gpu_ids = range(0, 8) freeze_lidar_components = True find_unused_parameters = True ================================================ FILE: configs/transfusion_waymo_voxel_L.py ================================================ point_cloud_range = [-75.2, -75.2, -2, 75.2, 75.2, 4] class_names = ['Car', 'Pedestrian', 'Cyclist'] voxel_size = [0.1, 0.1, 0.15] out_size_factor = 8 evaluation = dict(interval=1) dataset_type = 'WaymoDataset' data_root = 'data/waymo/kitti_format' input_modality = dict( 
use_lidar=True, use_camera=False, use_radar=False, use_map=False, use_external=False) train_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), # dict(type='ObjectSample', # db_sampler=dict( # data_root=data_root, # info_path=data_root + '/waymo_dbinfos_train.pkl', # rate=1.0, # prepare=dict( # filter_by_difficulty=[-1], # filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)), # classes=class_names, # sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10), # points_loader=dict( # type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4])) # ), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05], ), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5), dict( type='MultiScaleFlipAug3D', img_scale=(800, 1333), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1.0, 1.0], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] data = dict( samples_per_gpu=4, workers_per_gpu=6, train=dict( type='RepeatDataset', times=1, dataset=dict( type=dataset_type, data_root=data_root, load_interval=1, ann_file=data_root + '/waymo_infos_train.pkl', split='training', pipeline=train_pipeline, classes=class_names, 
modality=input_modality, test_mode=False, box_type_3d='LiDAR')), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + '/waymo_infos_val.pkl', split='training', load_interval=10, pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + '/waymo_infos_val.pkl', split='training', load_interval=10, pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR')) model = dict( type='TransFusionDetector', pts_voxel_layer=dict( max_num_points=5, voxel_size=voxel_size, max_voxels=150000, point_cloud_range=point_cloud_range), pts_voxel_encoder=dict( type='HardVFE', in_channels=5, # num_features=5, feat_channels=[64], with_distance=False, with_cluster_center=False, with_voxel_center=False, voxel_size=voxel_size, norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01), point_cloud_range=point_cloud_range, ), pts_middle_encoder=dict( type='SparseEncoder', in_channels=64, sparse_shape=[41, 1504, 1504], output_channels=128, order=('conv', 'norm', 'act'), encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), block_type='basicblock'), pts_backbone=dict( type='SECOND', in_channels=256, out_channels=[128, 256], layer_nums=[5, 5], layer_strides=[1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[128, 256], out_channels=[256, 256], upsample_strides=[1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), pts_bbox_head=dict( type='TransFusionHead', num_proposals=300, auxiliary=True, in_channels=256 * 2, hidden_channel=128, num_classes=len(class_names), num_decoder_layers=1, num_heads=8, learnable_query_pos=False, initialize_by_heatmap=True, 
nms_kernel_size=3, ffn_channel=256, dropout=0.1, bn_momentum=0.1, activation='relu', common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2)), bbox_coder=dict( type='TransFusionBBoxCoder', pc_range=point_cloud_range[:2], voxel_size=voxel_size[:2], out_size_factor=out_size_factor, post_center_range=[-80, -80, -10.0, 80, 80, 10.0], score_threshold=0.0, code_size=8, ), loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=1.0), # loss_iou=dict(type='CrossEntropyLoss', use_sigmoid=True, reduction='mean', loss_weight=0.0), loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=2.0), loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0), ), train_cfg=dict( pts=dict( dataset='Waymo', assigner=dict( type='HungarianAssigner3D', iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'), cls_cost=dict(type='FocalLossCost', gamma=2, alpha=0.25, weight=0.6), reg_cost=dict(type='BBoxBEVL1Cost', weight=2.0), iou_cost=dict(type='IoU3DCost', weight=2.0) ), pos_weight=-1, gaussian_overlap=0.1, min_radius=2, grid_size=[1504, 1504, 40], # [x_len, y_len, 1] voxel_size=voxel_size, out_size_factor=out_size_factor, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], point_cloud_range=point_cloud_range)), test_cfg=dict( pts=dict( dataset='Waymo', grid_size=[1504, 1504, 40], out_size_factor=out_size_factor, voxel_size=voxel_size[:2], nms_type=None, ))) optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01) # for 8gpu * 4sample_per_gpu optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) lr_config = dict( policy='cyclic', target_ratio=(10, 0.0001), cyclic_times=1, step_ratio_up=0.4) momentum_config = dict( policy='cyclic', target_ratio=(0.8947368421052632, 1), cyclic_times=1, step_ratio_up=0.4) total_epochs = 36 checkpoint_config = dict(interval=1) log_config = dict( interval=50, hooks=[dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook')]) dist_params = 
dict(backend='nccl') log_level = 'INFO' work_dir = None load_from = None resume_from = None workflow = [('train', 1)] gpu_ids = range(0, 8) ================================================ FILE: configs/transfusion_waymo_voxel_LC.py ================================================ point_cloud_range = [-75.2, -75.2, -2, 75.2, 75.2, 4] class_names = ['Car', 'Pedestrian', 'Cyclist'] voxel_size = [0.1, 0.1, 0.15] out_size_factor = 8 evaluation = dict(interval=1) dataset_type = 'WaymoDataset' data_root = 'data/waymo/kitti_format' input_modality = dict( use_lidar=True, use_camera=True, use_radar=False, use_map=False, use_external=False) img_scale = (640, 960) num_views = 5 img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict(type='LoadMultiViewImageFromFiles', img_scale=(1280, 1920)), dict(type='MyResize', img_scale=img_scale, keep_ratio=True), dict(type='MyNormalize', **img_norm_cfg), dict(type='MyPad', size_divisor=32), # dict( # type='RandomFlip3D', # sync_2d=True, # flip_ratio_bev_horizontal=0.5, # flip_ratio_bev_vertical=0.5), # dict( # type='GlobalRotScaleTrans', # rot_range=[-0.78539816, 0.78539816], # scale_ratio_range=[0.95, 1.05], # ), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5), dict(type='LoadMultiViewImageFromFiles', img_scale=(1280, 1920)), dict( type='MultiScaleFlipAug3D', img_scale=img_scale, pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], 
scale_ratio_range=[1.0, 1.0], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict(type='MyNormalize', **img_norm_cfg), dict(type='MyResize', img_scale=img_scale, keep_ratio=True), dict(type='MyPad', size_divisor=32), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points', 'img']) ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=6, train=dict( type='RepeatDataset', times=1, dataset=dict( type=dataset_type, data_root=data_root, load_interval=1, num_views=num_views, ann_file=data_root + '/waymo_infos_train.pkl', split='training', pipeline=train_pipeline, classes=class_names, modality=input_modality, test_mode=False, box_type_3d='LiDAR')), val=dict( type=dataset_type, data_root=data_root, num_views=num_views, ann_file=data_root + '/waymo_infos_val.pkl', split='training', load_interval=10, pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, num_views=num_views, ann_file=data_root + '/waymo_infos_val.pkl', split='training', load_interval=10, pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR')) model = dict( type='TransFusionDetector', freeze_img=True, img_backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch'), img_neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), pts_voxel_layer=dict( max_num_points=5, voxel_size=voxel_size, max_voxels=150000, point_cloud_range=point_cloud_range), pts_voxel_encoder=dict( type='HardVFE', in_channels=5, feat_channels=[64], with_distance=False, with_cluster_center=False, with_voxel_center=False, voxel_size=voxel_size, norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01), 
point_cloud_range=point_cloud_range, ), pts_middle_encoder=dict( type='SparseEncoder', in_channels=64, sparse_shape=[41, 1504, 1504], output_channels=128, order=('conv', 'norm', 'act'), encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), block_type='basicblock'), pts_backbone=dict( type='SECOND', in_channels=256, out_channels=[128, 256], layer_nums=[5, 5], layer_strides=[1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[128, 256], out_channels=[256, 256], upsample_strides=[1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), pts_bbox_head=dict( type='TransFusionHead', fuse_img=True, num_views=num_views, in_channels_img=256, out_size_factor_img=4, num_proposals=300, auxiliary=True, in_channels=256 * 2, hidden_channel=128, num_classes=len(class_names), num_decoder_layers=1, num_heads=8, learnable_query_pos=False, initialize_by_heatmap=True, nms_kernel_size=3, ffn_channel=256, dropout=0.1, bn_momentum=0.1, activation='relu', common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2)), bbox_coder=dict( type='TransFusionBBoxCoder', pc_range=point_cloud_range[:2], voxel_size=voxel_size[:2], out_size_factor=out_size_factor, post_center_range=[-80, -80, -10.0, 80, 80, 10.0], score_threshold=0.0, code_size=8, ), loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=1.0), # loss_iou=dict(type='CrossEntropyLoss', use_sigmoid=True, reduction='mean', loss_weight=0.0), loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=2.0), loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0), ), train_cfg=dict( pts=dict( dataset='Waymo', assigner=dict( type='HungarianAssigner3D', iou_calculator=dict(type='BboxOverlaps3D', 
coordinate='lidar'), cls_cost=dict(type='FocalLossCost', gamma=2, alpha=0.25, weight=0.6), reg_cost=dict(type='BBoxBEVL1Cost', weight=2.0), iou_cost=dict(type='IoU3DCost', weight=2.0) ), pos_weight=-1, gaussian_overlap=0.1, min_radius=2, grid_size=[1504, 1504, 40], # [x_len, y_len, 1] voxel_size=voxel_size, out_size_factor=out_size_factor, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], point_cloud_range=point_cloud_range)), test_cfg=dict( pts=dict( dataset='Waymo', pc_range=point_cloud_range[:2], grid_size=[1504, 1504, 40], out_size_factor=out_size_factor, voxel_size=voxel_size[:2], nms_type=None, ))) optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01) # for 8gpu * 2sample_per_gpu optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) lr_config = dict( policy='cyclic', target_ratio=(10, 0.0001), cyclic_times=1, step_ratio_up=0.4) momentum_config = dict( policy='cyclic', target_ratio=(0.8947368421052632, 1), cyclic_times=1, step_ratio_up=0.4) total_epochs = 12 checkpoint_config = dict(interval=1) log_config = dict( interval=50, hooks=[dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook')]) dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = None load_from = 'checkpoints/waymo_36e_R50.pth' resume_from = None workflow = [('train', 1)] freeze_lidar_components = True find_unused_parameters = True gpu_ids = range(0, 8) ================================================ FILE: configs/votenet/README.md ================================================ # Deep Hough Voting for 3D Object Detection in Point Clouds ## Introduction [ALGORITHM] We implement VoteNet and provide the result and checkpoints on ScanNet and SUNRGBD datasets. 
``` @inproceedings{qi2019deep, author = {Qi, Charles R and Litany, Or and He, Kaiming and Guibas, Leonidas J}, title = {Deep Hough Voting for 3D Object Detection in Point Clouds}, booktitle = {Proceedings of the IEEE International Conference on Computer Vision}, year = {2019} } ``` ## Results ### ScanNet | Backbone | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 |AP@0.5| Download | | :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | | [PointNet++](./votenet_8x8_scannet-3d-18class.py) | 3x |4.1||62.90|39.91|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/votenet/votenet_8x8_scannet-3d-18class/votenet_8x8_scannet-3d-18class_20200620_230238-2cea9c3a.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/votenet/votenet_8x8_scannet-3d-18class/votenet_8x8_scannet-3d-18class_20200620_230238.log.json)| ### SUNRGBD | Backbone | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 |AP@0.5| Download | | :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | | [PointNet++](./votenet_16x8_sunrgbd-3d-10class.py) | 3x |8.1||59.07|35.77|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/votenet/votenet_16x8_sunrgbd-3d-10class/votenet_16x8_sunrgbd-3d-10class_20200620_230238-4483c0c0.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/votenet/votenet_16x8_sunrgbd-3d-10class/votenet_16x8_sunrgbd-3d-10class_20200620_230238.log.json)| **Notice**: If your current mmdetection3d version >= 0.6.0, and you are using the checkpoints downloaded from the above links or using checkpoints trained with mmdetection3d version < 0.6.0, the checkpoints have to be first converted via [tools/model_converters/convert_votenet_checkpoints.py](../../tools/model_converters/convert_votenet_checkpoints.py): ``` python ./tools/model_converters/convert_votenet_checkpoints.py ${ORIGINAL_CHECKPOINT_PATH} --out=${NEW_CHECKPOINT_PATH} ``` Then you can use the converted checkpoints following 
[getting_started.md](../../docs/getting_started.md). ## Indeterminism Since test data preparation randomly downsamples the points, and the test script uses fixed random seeds while the random seeds of validation in training are not fixed, the test results may be slightly different from the results reported above. ## IoU loss Adding IoU loss (simply = 1-IoU) boosts VoteNet's performance. To use IoU loss, add this loss term to the config file: ```python iou_loss=dict(type='AxisAlignedIoULoss', reduction='sum', loss_weight=10.0 / 3.0) ``` | Backbone | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 |AP@0.5| Download | | :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | | [PointNet++](./votenet_iouloss_8x8_scannet-3d-18class.py) | 3x |4.1||63.81|44.21|/| For now, we only support calculating IoU loss for axis-aligned bounding boxes since the CUDA op of general 3D IoU calculation does not implement the backward method. Therefore, IoU loss can only be used for ScanNet dataset for now. 
================================================ FILE: configs/votenet/votenet_16x8_sunrgbd-3d-10class.py ================================================ _base_ = [ '../_base_/datasets/sunrgbd-3d-10class.py', '../_base_/models/votenet.py', '../_base_/schedules/schedule_3x.py', '../_base_/default_runtime.py' ] # model settings model = dict( bbox_head=dict( num_classes=10, bbox_coder=dict( type='PartialBinBasedBBoxCoder', num_sizes=10, num_dir_bins=12, with_rot=True, mean_sizes=[ [2.114256, 1.620300, 0.927272], [0.791118, 1.279516, 0.718182], [0.923508, 1.867419, 0.845495], [0.591958, 0.552978, 0.827272], [0.699104, 0.454178, 0.75625], [0.69519, 1.346299, 0.736364], [0.528526, 1.002642, 1.172878], [0.500618, 0.632163, 0.683424], [0.404671, 1.071108, 1.688889], [0.76584, 1.398258, 0.472728] ]), )) ================================================ FILE: configs/votenet/votenet_8x8_scannet-3d-18class.py ================================================ _base_ = [ '../_base_/datasets/scannet-3d-18class.py', '../_base_/models/votenet.py', '../_base_/schedules/schedule_3x.py', '../_base_/default_runtime.py' ] # model settings model = dict( bbox_head=dict( num_classes=18, bbox_coder=dict( type='PartialBinBasedBBoxCoder', num_sizes=18, num_dir_bins=1, with_rot=False, mean_sizes=[[0.76966727, 0.8116021, 0.92573744], [1.876858, 1.8425595, 1.1931566], [0.61328, 0.6148609, 0.7182701], [1.3955007, 1.5121545, 0.83443564], [0.97949594, 1.0675149, 0.6329687], [0.531663, 0.5955577, 1.7500148], [0.9624706, 0.72462326, 1.1481868], [0.83221924, 1.0490936, 1.6875663], [0.21132214, 0.4206159, 0.5372846], [1.4440073, 1.8970833, 0.26985747], [1.0294262, 1.4040797, 0.87554324], [1.3766412, 0.65521795, 1.6813129], [0.6650819, 0.71111923, 1.298853], [0.41999173, 0.37906948, 1.7513971], [0.59359556, 0.5912492, 0.73919016], [0.50867593, 0.50656086, 0.30136237], [1.1511526, 1.0546296, 0.49706793], [0.47535285, 0.49249494, 0.5802117]]))) # optimizer # yapf:disable log_config = dict( interval=30, 
hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) # yapf:enable ================================================ FILE: configs/votenet/votenet_iouloss_8x8_scannet-3d-18class.py ================================================ _base_ = ['./votenet_8x8_scannet-3d-18class.py'] # model settings, add iou loss model = dict( bbox_head=dict( iou_loss=dict( type='AxisAlignedIoULoss', reduction='sum', loss_weight=10.0 / 3.0))) ================================================ FILE: configs/waymo.md ================================================ # MODEL ZOO ## Common settings and notes - The experiments are run with PyTorch 1.7.0, CUDA 10.1 and CUDNN 7.6 - The training is conducted on 8 Tesla V100 GPUs ## Waymo 3D Detection We try a few training schedules for TransFusion-L and list the performance below. The fusion-based models are further trained for 6 epochs from the pretrained LiDAR backbone. We freeze the weight of LiDAR backbone to save GPU memory. | Model | Backbone | epoch | Veh_L2 | Ped_L2 | Cyc_L2 | MAPH | |---------|--------|--------|---------|---------|---------|---------| | [TransFusion-L](configs/transfusion_waymo_voxel_L.py) | VoxelNet | 12 | 63.86 | 62.84 | 67.17 | 64.63 | [TransFusion-L](configs/transfusion_waymo_voxel_L.py) | VoxelNet | 24 | 64.54 | 63.39 | 66.43 | 64.78 | [TransFusion-L](configs/transfusion_waymo_voxel_L.py) | VoxelNet | 36 | 65.07 | 63.70 | 65.97 | 64.91 | [TransFusion](configs/transfusion_waymo_voxel_LC.py) | VoxelNet | 36 + 6| 65.11 | 64.02 | 67.40 | 65.51 ================================================ FILE: demo/pcd_demo.py ================================================ from argparse import ArgumentParser from mmdet3d.apis import inference_detector, init_detector, show_result_meshlab def main(): parser = ArgumentParser() parser.add_argument('pcd', help='Point cloud file') parser.add_argument('config', help='Config file') parser.add_argument('checkpoint', help='Checkpoint file') parser.add_argument( 
'--device', default='cuda:0', help='Device used for inference')
    # NOTE(review): --score-thr is parsed but not passed to any call in main().
    parser.add_argument(
        '--score-thr', type=float, default=0.6, help='bbox score threshold')
    parser.add_argument(
        '--out-dir', type=str, default='demo', help='dir to save results')
    args = parser.parse_args()

    # build the model from a config file and a checkpoint file
    model = init_detector(args.config, args.checkpoint, device=args.device)
    # run inference on a single point cloud file
    result, data = inference_detector(model, args.pcd)
    # show the results
    show_result_meshlab(data, result, args.out_dir)


if __name__ == '__main__':
    main()

================================================
FILE: docker/Dockerfile
================================================
ARG PYTORCH="1.6.0"
ARG CUDA="10.1"
ARG CUDNN="7"

FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel

ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX"
ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"

# NOTE(review): libsm6 and libxext6 each appear twice in the package list below.
RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install MMCV and MMDetection
RUN pip install mmcv-full==latest+torch1.6.0+cu101 -f https://openmmlab.oss-accelerate.aliyuncs.com/mmcv/dist/index.html
RUN pip install mmdet

# Install MMDetection3D (cloned from GitHub and installed in editable mode)
RUN conda clean --all
RUN git clone https://github.com/open-mmlab/mmdetection3d.git /mmdetection3d
WORKDIR /mmdetection3d
ENV FORCE_CUDA="1"
RUN pip install -r requirements/build.txt
RUN pip install --no-cache-dir -e .
# uninstall pycocotools installed by nuscenes-devkit and reinstall mmpycocotools RUN pip uninstall pycocotools --no-cache-dir -y RUN pip install mmpycocotools --no-cache-dir --force --no-deps ================================================ FILE: mmdet3d/__init__.py ================================================ import mmcv import mmdet from .version import __version__, short_version def digit_version(version_str): digit_version = [] for x in version_str.split('.'): if x.isdigit(): digit_version.append(int(x)) elif x.find('rc') != -1: patch_version = x.split('rc') digit_version.append(int(patch_version[0]) - 1) digit_version.append(int(patch_version[1])) return digit_version mmcv_minimum_version = '1.2.4' mmcv_maximum_version = '1.4.0' mmcv_version = digit_version(mmcv.__version__) assert (mmcv_version >= digit_version(mmcv_minimum_version) and mmcv_version <= digit_version(mmcv_maximum_version)), \ f'MMCV=={mmcv.__version__} is used but incompatible. ' \ f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.' mmdet_minimum_version = '2.5.0' mmdet_maximum_version = '3.0.0' mmdet_version = digit_version(mmdet.__version__) assert (mmdet_version >= digit_version(mmdet_minimum_version) and mmdet_version <= digit_version(mmdet_maximum_version)), \ f'MMDET=={mmdet.__version__} is used but incompatible. ' \ f'Please install mmdet>={mmdet_minimum_version}, ' \ f'<={mmdet_maximum_version}.' 
__all__ = ['__version__', 'short_version']

================================================
FILE: mmdet3d/apis/__init__.py
================================================
from .inference import (convert_SyncBN, inference_detector, init_detector,
                        show_result_meshlab)
from .test import single_gpu_test

__all__ = [
    'inference_detector', 'init_detector', 'single_gpu_test',
    'show_result_meshlab', 'convert_SyncBN'
]

================================================
FILE: mmdet3d/apis/inference.py
================================================
import mmcv
import torch
from copy import deepcopy
from mmcv.parallel import collate, scatter
from mmcv.runner import load_checkpoint
from os import path as osp

from mmdet3d.core import Box3DMode, show_result
from mmdet3d.core.bbox import get_box_type
from mmdet3d.datasets.pipelines import Compose
from mmdet3d.models import build_detector


def convert_SyncBN(config):
    """Convert config's naiveSyncBN to BN.

    The config is modified in place: for every ``norm_cfg`` entry found while
    recursing through nested dicts, ``'naiveSyncBN'`` in its ``type`` string
    is replaced by ``'BN'``. Values that are not dicts (including lists of
    dicts) are not descended into.

    Args:
        config (dict): (Part of) a model config, e.g. ``config.model`` as
            passed by :func:`init_detector`. Non-dict inputs are ignored.
    """
    if isinstance(config, dict):
        for item in config:
            if item == 'norm_cfg':
                config[item]['type'] = config[item]['type']. \
                                    replace('naiveSyncBN', 'BN')
            else:
                convert_SyncBN(config[item])


def init_detector(config, checkpoint=None, device='cuda:0'):
    """Initialize a detector from config file.

    Args:
        config (str or :obj:`mmcv.Config`): Config file path or the config
            object.
        checkpoint (str, optional): Checkpoint path. If left as None, the model
            will not load any weights.
        device (str): Device to use.

    Returns:
        nn.Module: The constructed detector, moved to ``device`` and set to
            eval mode, with the config attached as ``model.cfg``.

    Raises:
        TypeError: If ``config`` is neither a str nor an ``mmcv.Config``.
    """
    if isinstance(config, str):
        config = mmcv.Config.fromfile(config)
    elif not isinstance(config, mmcv.Config):
        raise TypeError('config must be a filename or Config object, '
                        f'but got {type(config)}')
    config.model.pretrained = None
    # replace naiveSyncBN by plain BN in every norm_cfg of the model config
    convert_SyncBN(config.model)
    config.model.train_cfg = None
    model = build_detector(config.model, test_cfg=config.get('test_cfg'))
    # NOTE: model.CLASSES is only set when a checkpoint is given
    if checkpoint is not None:
        checkpoint = load_checkpoint(model, checkpoint)
        # prefer the class names stored in the checkpoint meta and fall back
        # to the ones from the config
        if 'CLASSES' in checkpoint['meta']:
            model.CLASSES = checkpoint['meta']['CLASSES']
        else:
            model.CLASSES = config.class_names
    model.cfg = config  # save the config in the model for convenience
    model.to(device)
    model.eval()
    return model


def inference_detector(model, pcd):
    """Inference point cloud with the detector.

    Args:
        model (nn.Module): The loaded detector. Its ``cfg`` attribute (set by
            :func:`init_detector`) provides the test pipeline.
        pcd (str): Point cloud file; passed to the pipeline as
            ``pts_filename``.

    Returns:
        tuple: ``(result, data)``, the model output and the collated data
            produced by the test pipeline.
    """
    cfg = model.cfg
    device = next(model.parameters()).device  # model device
    # build the data pipeline
    test_pipeline = deepcopy(cfg.data.test.pipeline)
    test_pipeline = Compose(test_pipeline)
    box_type_3d, box_mode_3d = get_box_type(cfg.data.test.box_type_3d)
    # minimal pipeline input: only the point cloud path and box type are
    # filled in, every other field starts out empty
    data = dict(
        pts_filename=pcd,
        box_type_3d=box_type_3d,
        box_mode_3d=box_mode_3d,
        sweeps=[],
        # set timestamp = 0
        timestamp=[0],
        img_fields=[],
        bbox3d_fields=[],
        pts_mask_fields=[],
        pts_seg_fields=[],
        bbox_fields=[],
        mask_fields=[],
        seg_fields=[])
    data = test_pipeline(data)
    # batch of one sample
    data = collate([data], samples_per_gpu=1)
    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device.index])[0]
    else:
        # this is a workaround to avoid the bug of MMDataParallel
        data['img_metas'] = data['img_metas'][0].data
        data['points'] = data['points'][0].data
    # forward the model
    with torch.no_grad():
        result = model(return_loss=False, rescale=True, **data)
    return result, data


def show_result_meshlab(data, result, out_dir):
    """Show result by meshlab.

    Args:
        data (dict): Contain data from pipeline.
        result (list[dict]): Predicted results from model; only
            ``result[0]`` is visualized.
out_dir (str): Directory to save visualized result.

    Returns:
        tuple: ``(out_dir, file_name)``, where ``file_name`` is the base name
            of the point cloud file up to its first ``'.'``.
    """
    points = data['points'][0][0].cpu().numpy()
    pts_filename = data['img_metas'][0][0]['pts_filename']
    file_name = osp.split(pts_filename)[-1].split('.')[0]

    assert out_dir is not None, 'Expect out_dir, got none.'

    # fusion-style results nest the 3D boxes under 'pts_bbox'
    if 'pts_bbox' in result[0].keys():
        pred_bboxes = result[0]['pts_bbox']['boxes_3d'].tensor.numpy()
    else:
        pred_bboxes = result[0]['boxes_3d'].tensor.numpy()
    # for now we convert points into depth mode
    if data['img_metas'][0][0]['box_mode_3d'] != Box3DMode.DEPTH:
        # swap the first two coordinates, then negate the new first one
        points = points[..., [1, 0, 2]]
        points[..., 0] *= -1
        pred_bboxes = Box3DMode.convert(pred_bboxes,
                                        data['img_metas'][0][0]['box_mode_3d'],
                                        Box3DMode.DEPTH)
    show_result(points, None, pred_bboxes, out_dir, file_name, show=False)

    return out_dir, file_name

================================================
FILE: mmdet3d/apis/test.py
================================================
import mmcv
import torch


def single_gpu_test(model, data_loader, show=False, out_dir=None):
    """Test model with single gpu.

    This method tests model with single gpu and gives the 'show' option.
    By setting ``show=True``, it saves the visualization results under
    ``out_dir``.

    Args:
        model (nn.Module): Model to be tested. When ``show`` is True the
            visualization is done through ``model.module.show_results``.
        data_loader (nn.Dataloader): Pytorch data loader.
        show (bool): Whether to save visualization results.
            Default: False.
        out_dir (str): The path to save visualization results.
            Default: None.

    Returns:
        list[dict]: The prediction results.
    """
    model.eval()
    results = []
    dataset = data_loader.dataset
    prog_bar = mmcv.ProgressBar(len(dataset))
    for i, data in enumerate(data_loader):
        with torch.no_grad():
            result = model(return_loss=False, rescale=True, **data)

        if show:
            model.module.show_results(data, result, out_dir)

        results.extend(result)

        # advance the progress bar once per sample in this batch
        batch_size = len(result)
        for _ in range(batch_size):
            prog_bar.update()
    return results

================================================
FILE: mmdet3d/core/__init__.py
================================================
from .anchor import *  # noqa: F401, F403
from .bbox import *  # noqa: F401, F403
from .evaluation import *  # noqa: F401, F403
from .points import *  # noqa: F401, F403
from .post_processing import *  # noqa: F401, F403
from .utils import *  # noqa: F401, F403
from .visualizer import *  # noqa: F401, F403
from .voxel import *  # noqa: F401, F403

================================================
FILE: mmdet3d/core/anchor/__init__.py
================================================
from mmdet.core.anchor import build_anchor_generator
from .anchor_3d_generator import (AlignedAnchor3DRangeGenerator,
                                  AlignedAnchor3DRangeGeneratorPerCls,
                                  Anchor3DRangeGenerator)

__all__ = [
    'AlignedAnchor3DRangeGenerator', 'Anchor3DRangeGenerator',
    'build_anchor_generator', 'AlignedAnchor3DRangeGeneratorPerCls'
]

================================================
FILE: mmdet3d/core/anchor/anchor_3d_generator.py
================================================
import mmcv
import torch

from mmdet.core.anchor import ANCHOR_GENERATORS


@ANCHOR_GENERATORS.register_module()
class Anchor3DRangeGenerator(object):
    """3D Anchor Generator by range.

    This anchor generator generates anchors by the given range in different
    feature levels.
    Due to the convention in 3D detection, different anchor sizes are related
    to different ranges for different categories. However we find this setting
    does not affect the performance much in some datasets, e.g., nuScenes.
Args:
        ranges (list[list[float]]): Ranges of different anchors.
            The ranges are the same across different feature levels. But may
            vary for different anchor sizes if size_per_range is True.
        sizes (list[list[float]]): 3D sizes of anchors.
        scales (list[int]): Scales of anchors in different feature levels.
        rotations (list[float]): Rotations of anchors in a feature grid.
        custom_values (tuple[float]): Customized values of that anchor. For
            example, in nuScenes the anchors have velocities.
        reshape_out (bool): Whether to reshape the output into
            (N x box_code_size).
        size_per_range (bool): Whether to use separate ranges for different
            sizes. If size_per_range is True, the ranges should have the same
            length as the sizes; a single range is duplicated to match.
    """

    def __init__(self,
                 ranges,
                 sizes=[[1.6, 3.9, 1.56]],
                 scales=[1],
                 rotations=[0, 1.5707963],
                 custom_values=(),
                 reshape_out=True,
                 size_per_range=True):
        assert mmcv.is_list_of(ranges, list)
        if size_per_range:
            if len(sizes) != len(ranges):
                # a single range is shared by (repeated for) every size
                assert len(ranges) == 1
                ranges = ranges * len(sizes)
            assert len(ranges) == len(sizes)
        else:
            assert len(ranges) == 1
        assert mmcv.is_list_of(sizes, list)
        assert isinstance(scales, list)

        self.sizes = sizes
        self.scales = scales
        self.ranges = ranges
        self.rotations = rotations
        self.custom_values = custom_values
        self.cached_anchors = None
        self.reshape_out = reshape_out
        self.size_per_range = size_per_range

    def __repr__(self):
        """str: Multi-line summary of the generator's configuration."""
        s = self.__class__.__name__ + '('
        s += f'anchor_range={self.ranges},\n'
        s += f'scales={self.scales},\n'
        s += f'sizes={self.sizes},\n'
        s += f'rotations={self.rotations},\n'
        s += f'reshape_out={self.reshape_out},\n'
        s += f'size_per_range={self.size_per_range})'
        return s

    @property
    def num_base_anchors(self):
        """int: Total number of base anchors in a feature grid, i.e. the
        number of rotations times the number of anchor sizes."""
        num_rot = len(self.rotations)
        num_size = torch.tensor(self.sizes).reshape(-1, 3).size(0)
        return num_rot * num_size

    @property
    def num_levels(self):
        """int: Number of feature levels that the generator is applied to."""
        return
len(self.scales)

    def grid_anchors(self, featmap_sizes, device='cuda'):
        """Generate grid anchors in multiple feature levels.

        Args:
            featmap_sizes (list[tuple]): List of feature map sizes in
                multiple feature levels.
            device (str): Device where the anchors will be put on.

        Returns:
            list[torch.Tensor]: Anchors in multiple feature levels. \
                If ``reshape_out`` is True, each tensor has shape \
                [N, box_code_size], where \
                N = width * height * num_base_anchors, width and height \
                are the sizes of the corresponding feature level, \
                num_base_anchors is the number of anchors for that level.
        """
        assert self.num_levels == len(featmap_sizes)
        multi_level_anchors = []
        for i in range(self.num_levels):
            anchors = self.single_level_grid_anchors(
                featmap_sizes[i], self.scales[i], device=device)
            if self.reshape_out:
                # flatten every leading dim, keep the box code as last dim
                anchors = anchors.reshape(-1, anchors.size(-1))
            multi_level_anchors.append(anchors)
        return multi_level_anchors

    def single_level_grid_anchors(self, featmap_size, scale, device='cuda'):
        """Generate grid anchors of a single level feature map.

        This function is usually called by method ``self.grid_anchors``.

        Args:
            featmap_size (tuple[int]): Size of the feature map.
            scale (float): Scale factor of the anchors in the current level.
            device (str, optional): Device the tensor will be put on.
                Defaults to 'cuda'.

        Returns:
            torch.Tensor: Anchors in the overall feature map.
        """
        # We reimplement the anchor generator using torch in cuda
        # torch: 0.6975 s for 1000 times
        # numpy: 4.3345 s for 1000 times
        # which is ~5 times faster than the numpy implementation
        if not self.size_per_range:
            # one shared range: all sizes are generated in a single call
            return self.anchors_single_range(
                featmap_size,
                self.ranges[0],
                scale,
                self.sizes,
                self.rotations,
                device=device)

        # one (range, size) pair at a time, concatenated along the size dim
        mr_anchors = []
        for anchor_range, anchor_size in zip(self.ranges, self.sizes):
            mr_anchors.append(
                self.anchors_single_range(
                    featmap_size,
                    anchor_range,
                    scale,
                    anchor_size,
                    self.rotations,
                    device=device))
        mr_anchors = torch.cat(mr_anchors, dim=-3)
        return mr_anchors

    def anchors_single_range(self,
                             feature_size,
                             anchor_range,
                             scale=1,
                             sizes=[[1.6, 3.9, 1.56]],
                             rotations=[0, 1.5707963],
                             device='cuda'):
        """Generate anchors in a single range.

        Args:
            feature_size (list[float] | tuple[float]): Feature map size. It is
                either a list or a tuple of [D, H, W] (in order of z, y, and
                x). A 2-element size is treated as [1, H, W].
            anchor_range (torch.Tensor | list[float]): Range of anchors with
                shape [6]. The order is consistent with that of anchors, i.e.,
                (x_min, y_min, z_min, x_max, y_max, z_max).
            scale (float | int, optional): The scale factor of anchors.
            sizes (list[list] | np.ndarray | torch.Tensor): Anchor size with
                shape [N, 3], in order of x, y, z.
            rotations (list[float] | np.ndarray | torch.Tensor): Rotations of
                anchors in a single feature grid.
            device (str): Devices that the anchors will be put on.

        Returns:
            torch.Tensor: Anchors with shape \
                [*feature_size, num_sizes, num_rots, 7]. If \
                ``custom_values`` is non-empty, the last dim is extended \
                by ``len(custom_values)`` zero-filled entries.
        """
        if len(feature_size) == 2:
            feature_size = [1, feature_size[0], feature_size[1]]
        anchor_range = torch.tensor(anchor_range, device=device)
        # centers are spread uniformly from the range minimum to the range
        # maximum (both included), one per feature map cell
        z_centers = torch.linspace(
            anchor_range[2], anchor_range[5], feature_size[0], device=device)
        y_centers = torch.linspace(
            anchor_range[1], anchor_range[4], feature_size[1], device=device)
        x_centers = torch.linspace(
            anchor_range[0], anchor_range[3], feature_size[2], device=device)
        sizes = torch.tensor(sizes, device=device).reshape(-1, 3) * scale
        rotations = torch.tensor(rotations, device=device)

        # torch.meshgrid default behavior is 'ij', np's default is 'xy'
        rets = torch.meshgrid(x_centers, y_centers, z_centers, rotations)
        # torch.meshgrid returns a tuple rather than list
        rets = list(rets)
        # insert a size dim before the rotation dim and tile along it:
        # [X, Y, Z, R] -> [X, Y, Z, num_sizes, R, 1]
        tile_shape = [1] * 5
        tile_shape[-2] = int(sizes.shape[0])
        for i in range(len(rets)):
            rets[i] = rets[i].unsqueeze(-2).repeat(tile_shape).unsqueeze(-1)

        # broadcast the sizes to [X, Y, Z, num_sizes, R, 3] and place them
        # between the centers and the rotation
        sizes = sizes.reshape([1, 1, 1, -1, 1, 3])
        tile_size_shape = list(rets[0].shape)
        tile_size_shape[3] = 1
        sizes = sizes.repeat(tile_size_shape)
        rets.insert(3, sizes)

        ret = torch.cat(rets, dim=-1).permute([2, 1, 0, 3, 4, 5])
        # [1, 200, 176, N, 2, 7] for kitti after permute

        if len(self.custom_values) > 0:
            custom_ndim = len(self.custom_values)
            # NOTE: the extra entries stay zero because the assignment of
            # self.custom_values below is commented out
            custom = ret.new_zeros([*ret.shape[:-1], custom_ndim])
            # custom[:] = self.custom_values
            ret = torch.cat([ret, custom], dim=-1)
            # [1, 200, 176, N, 2, 9] for nus dataset after permute
        return ret


@ANCHOR_GENERATORS.register_module()
class AlignedAnchor3DRangeGenerator(Anchor3DRangeGenerator):
    """Aligned 3D Anchor Generator by range.

    This anchor generator uses a different manner to generate the positions
    of anchors' centers from :class:`Anchor3DRangeGenerator`.

    Note:
        The `align` means that the anchor's center is aligned with the voxel
        grid, which is also the feature grid. The previous implementation of
        :class:`Anchor3DRangeGenerator` does not generate the anchors' center
        according to the voxel grid.
Rather, it generates the center by uniformly distributing the anchors
        inside the minimum and maximum anchor ranges according to the feature
        map sizes.
        However, this means the anchors' centers do not match the feature
        grid. The :class:`AlignedAnchor3DRangeGenerator` adds + 1 when using
        the feature map sizes to obtain the corners of the voxel grid. Then it
        shifts the coordinates to the center of the voxel grid and uses the
        left up corner to distribute anchors.

    Args:
        align_corner (bool): Whether to align with the corner of the voxel
            grid. By default it is False and the anchor's center will be
            the same as the corresponding voxel's center, which is also the
            center of the corresponding feature grid.
        **kwargs: Passed on to :class:`Anchor3DRangeGenerator`.
    """

    def __init__(self, align_corner=False, **kwargs):
        super(AlignedAnchor3DRangeGenerator, self).__init__(**kwargs)
        self.align_corner = align_corner

    def anchors_single_range(self,
                             feature_size,
                             anchor_range,
                             scale,
                             sizes=[[1.6, 3.9, 1.56]],
                             rotations=[0, 1.5707963],
                             device='cuda'):
        """Generate anchors in a single range.

        Args:
            feature_size (list[float] | tuple[float]): Feature map size. It is
                either a list or a tuple of [D, H, W] (in order of z, y, and
                x). A 2-element size is treated as [1, H, W].
            anchor_range (torch.Tensor | list[float]): Range of anchors with
                shape [6]. The order is consistent with that of anchors, i.e.,
                (x_min, y_min, z_min, x_max, y_max, z_max).
            scale (float | int): The scale factor of anchors.
            sizes (list[list] | np.ndarray | torch.Tensor): Anchor size with
                shape [N, 3], in order of x, y, z.
            rotations (list[float] | np.ndarray | torch.Tensor): Rotations of
                anchors in a single feature grid.
            device (str): Devices that the anchors will be put on.

        Returns:
            torch.Tensor: Anchors with shape \
                [*feature_size, num_sizes, num_rots, 7]. If \
                ``custom_values`` is non-empty, the last dim is extended \
                by ``len(custom_values)`` zero-filled entries.
        """
        if len(feature_size) == 2:
            feature_size = [1, feature_size[0], feature_size[1]]
        anchor_range = torch.tensor(anchor_range, device=device)
        # feature_size + 1 evenly spaced points per axis between the range
        # bounds, i.e. the corners of the grid cells
        z_centers = torch.linspace(
            anchor_range[2],
            anchor_range[5],
            feature_size[0] + 1,
            device=device)
        y_centers = torch.linspace(
            anchor_range[1],
            anchor_range[4],
            feature_size[1] + 1,
            device=device)
        x_centers = torch.linspace(
            anchor_range[0],
            anchor_range[3],
            feature_size[2] + 1,
            device=device)
        sizes = torch.tensor(sizes, device=device).reshape(-1, 3) * scale
        rotations = torch.tensor(rotations, device=device)

        # shift the anchor center
        if not self.align_corner:
            # move every point by half a grid step: corner -> cell center
            z_shift = (z_centers[1] - z_centers[0]) / 2
            y_shift = (y_centers[1] - y_centers[0]) / 2
            x_shift = (x_centers[1] - x_centers[0]) / 2
            z_centers += z_shift
            y_centers += y_shift
            x_centers += x_shift

        # torch.meshgrid default behavior is 'ij', np's default is 'xy'
        # only the first feature_size points of each axis are used; the
        # extra last point is dropped
        rets = torch.meshgrid(x_centers[:feature_size[2]],
                              y_centers[:feature_size[1]],
                              z_centers[:feature_size[0]], rotations)

        # torch.meshgrid returns a tuple rather than list
        rets = list(rets)
        # insert a size dim before the rotation dim and tile along it:
        # [X, Y, Z, R] -> [X, Y, Z, num_sizes, R, 1]
        tile_shape = [1] * 5
        tile_shape[-2] = int(sizes.shape[0])
        for i in range(len(rets)):
            rets[i] = rets[i].unsqueeze(-2).repeat(tile_shape).unsqueeze(-1)

        # broadcast the sizes to [X, Y, Z, num_sizes, R, 3] and place them
        # between the centers and the rotation
        sizes = sizes.reshape([1, 1, 1, -1, 1, 3])
        tile_size_shape = list(rets[0].shape)
        tile_size_shape[3] = 1
        sizes = sizes.repeat(tile_size_shape)
        rets.insert(3, sizes)

        ret = torch.cat(rets, dim=-1).permute([2, 1, 0, 3, 4, 5])

        if len(self.custom_values) > 0:
            custom_ndim = len(self.custom_values)
            # NOTE: the extra entries stay zero because the assignment of
            # self.custom_values below is commented out
            custom = ret.new_zeros([*ret.shape[:-1], custom_ndim])
            # TODO: check the support of custom values
            # custom[:] = self.custom_values
            ret = torch.cat([ret, custom], dim=-1)
        return ret


@ANCHOR_GENERATORS.register_module()
class AlignedAnchor3DRangeGeneratorPerCls(AlignedAnchor3DRangeGenerator):
    """3D Anchor Generator by range for per class.

    This anchor generator generates anchors by the given range for per class.
    Note that feature maps of different classes may be different.
Args:
        kwargs (dict): Arguments are the same as those in \
            :class:`AlignedAnchor3DRangeGenerator`. ``scales`` must contain \
            exactly one entry.
    """

    def __init__(self, **kwargs):
        super(AlignedAnchor3DRangeGeneratorPerCls, self).__init__(**kwargs)
        assert len(self.scales) == 1, 'Multi-scale feature map levels are' + \
            ' not supported currently in this kind of anchor generator.'

    def grid_anchors(self, featmap_sizes, device='cuda'):
        """Generate grid anchors in multiple feature levels.

        Args:
            featmap_sizes (list[tuple]): List of feature map sizes for \
                different classes in a single feature level.
            device (str): Device where the anchors will be put on.

        Returns:
            list[list[torch.Tensor]]: Anchors in multiple feature levels. \
                Note that in this anchor generator, we currently only \
                support single feature level, so the outer list has one \
                element. The sizes of each tensor \
                should be [num_sizes/ranges*num_rots*featmap_size, \
                box_code_size].
        """
        multi_level_anchors = []
        anchors = self.multi_cls_grid_anchors(
            featmap_sizes, self.scales[0], device=device)
        multi_level_anchors.append(anchors)
        return multi_level_anchors

    def multi_cls_grid_anchors(self, featmap_sizes, scale, device='cuda'):
        """Generate grid anchors of a single level feature map for multi-class
        with different feature map sizes.

        This function is usually called by method ``self.grid_anchors``.

        Args:
            featmap_sizes (list[tuple]): List of feature map sizes for \
                different classes in a single feature level.
            scale (float): Scale factor of the anchors in the current level.
            device (str, optional): Device the tensor will be put on.
                Defaults to 'cuda'.

        Returns:
            list[torch.Tensor]: One anchor tensor per class, each of shape \
                [num_sizes/ranges*num_rots*featmap_size, box_code_size].
        """
        assert len(featmap_sizes) == len(self.sizes) == len(self.ranges), \
            'The number of different feature map sizes anchor sizes and ' + \
            'ranges should be the same.'

        multi_cls_anchors = []
        # class i uses its own feature map size, anchor range and anchor size
        for i in range(len(featmap_sizes)):
            anchors = self.anchors_single_range(
                featmap_sizes[i],
                self.ranges[i],
                scale,
                self.sizes[i],
                self.rotations,
                device=device)
            # [*featmap_size, num_sizes/ranges, num_rots, box_code_size]
            ndim = len(featmap_sizes[i])
            anchors = anchors.view(*featmap_sizes[i], -1, anchors.size(-1))
            # [*featmap_size, num_sizes/ranges*num_rots, box_code_size]
            anchors = anchors.permute(ndim, *range(0, ndim), ndim + 1)
            # [num_sizes/ranges*num_rots, *featmap_size, box_code_size]
            multi_cls_anchors.append(anchors.reshape(-1, anchors.size(-1)))
            # [num_sizes/ranges*num_rots*featmap_size, box_code_size]
        return multi_cls_anchors

================================================
FILE: mmdet3d/core/bbox/__init__.py
================================================
from .assigners import AssignResult, BaseAssigner, MaxIoUAssigner
from .coders import DeltaXYZWLHRBBoxCoder
# from .bbox_target import bbox_target
from .iou_calculators import (AxisAlignedBboxOverlaps3D, BboxOverlaps3D,
                              BboxOverlapsNearest3D,
                              axis_aligned_bbox_overlaps_3d, bbox_overlaps_3d,
                              bbox_overlaps_nearest_3d)
from .samplers import (BaseSampler, CombinedSampler,
                       InstanceBalancedPosSampler, IoUBalancedNegSampler,
                       PseudoSampler, RandomSampler, SamplingResult)
from .structures import (BaseInstance3DBoxes, Box3DMode, CameraInstance3DBoxes,
                         Coord3DMode, DepthInstance3DBoxes,
                         LiDARInstance3DBoxes, get_box_type, limit_period,
                         points_cam2img, xywhr2xyxyr)
from .transforms import bbox3d2result, bbox3d2roi, bbox3d_mapping_back

__all__ = [
    'BaseSampler', 'AssignResult', 'BaseAssigner', 'MaxIoUAssigner',
    'PseudoSampler', 'RandomSampler', 'InstanceBalancedPosSampler',
    'IoUBalancedNegSampler', 'CombinedSampler', 'SamplingResult',
    'DeltaXYZWLHRBBoxCoder', 'BboxOverlapsNearest3D', 'BboxOverlaps3D',
    'bbox_overlaps_nearest_3d', 'bbox_overlaps_3d',
    'AxisAlignedBboxOverlaps3D', 'axis_aligned_bbox_overlaps_3d', 'Box3DMode',
    'LiDARInstance3DBoxes', 'CameraInstance3DBoxes', 'bbox3d2roi',
from mmdet.core.bbox import AssignResult, BaseAssigner, MaxIoUAssigner
from .hungarian_assigner import (HeuristicAssigner3D, HungarianAssigner3D,
                                 HungarianAssignerCameraBox,
                                 HungarianAssignerView2D,
                                 HungarianAssignerViewProj2D)

# Every entry must name an object that really exists in this module: a
# star-import looks each one up and raises AttributeError on a missing name.
# The heuristic assigner is imported as ``HeuristicAssigner3D``, so that is
# the name exported here.
__all__ = [
    'BaseAssigner', 'MaxIoUAssigner', 'AssignResult', 'HungarianAssigner3D',
    'HeuristicAssigner3D', 'HungarianAssignerView2D',
    'HungarianAssignerViewProj2D', 'HungarianAssignerCameraBox'
]
@BBOX_ASSIGNERS.register_module()
class HeuristicAssigner3D(BaseAssigner):
    """Assign every ground-truth box to its nearest prediction in BEV.

    The distance is the Euclidean distance between the first two box
    coordinates of a prediction and of a ground-truth box. A ground-truth
    box is matched only to its single nearest prediction, and only when that
    distance does not exceed ``dist_thre``. When several ground-truth boxes
    pick the same prediction, the closest one wins.

    Args:
        dist_thre (float): Maximum matching distance (in meters according to
            the original code comment). Defaults to 100.
        iou_calculator (dict): Config passed to ``build_iou_calculator``.
    """

    def __init__(self,
                 dist_thre=100,
                 iou_calculator=dict(type='BboxOverlaps3D')
                 ):
        self.iou_calculator = build_iou_calculator(iou_calculator)
        self.dist_thre = dist_thre

    def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None, query_labels=None):
        """Match predictions to ground-truth boxes by BEV center distance.

        Args:
            bboxes (Tensor): Predicted boxes; columns 0-1 are read as the
                BEV center.
            gt_bboxes (Tensor): Ground-truth boxes; columns 0-1 are read as
                the BEV center.
            gt_bboxes_ignore: Not used by this assigner.
            gt_labels (Tensor): Label of every ground-truth box. Although it
                defaults to None, it is indexed whenever a match is made.
            query_labels (Tensor, optional): Label of every prediction. When
                given, pairs with different labels get ``dist_thre`` added
                to their distance.

        Returns:
            :obj:`AssignResult`: ``gt_inds`` is 0 for unmatched predictions
                and the 1-based ground-truth index otherwise. ``labels`` is
                -1 for unmatched predictions and keeps the dtype of
                ``bboxes``.
        """
        max_dist = self.dist_thre
        num_gts = len(gt_bboxes)
        num_bboxes = len(bboxes)

        # pairwise BEV center distance, [num_gts, num_bboxes]
        pred_xy = bboxes[:, 0:2][None, :, :]
        gt_xy = gt_bboxes[:, 0:2][:, None, :]
        bev_dist = torch.norm(pred_xy - gt_xy, dim=-1)
        if query_labels is not None:
            # push pairs of different categories away by the threshold
            class_mismatch = (query_labels[None] != gt_labels[:, None])
            bev_dist += class_mismatch * max_dist

        # index of the closest prediction for every gt box, [num_gts]
        _, nearest_pred = bev_dist.min(1)

        template = torch.ones([num_bboxes, ]).to(bboxes)
        assigned_gt_inds = template * 0
        best_dist = template * 10000
        assigned_gt_labels = template * -1
        for gt_idx in range(num_gts):
            pred_idx = nearest_pred[gt_idx]
            dist = bev_dist[gt_idx, pred_idx]
            # keep the match only if it is within range and beats whatever
            # gt box has already claimed this prediction
            if dist <= max_dist and dist < best_dist[pred_idx]:
                best_dist[pred_idx] = dist
                # for AssignResult, 0 is negative, -1 is ignore,
                # 1-based indices are positive
                assigned_gt_inds[pred_idx] = gt_idx + 1
                assigned_gt_labels[pred_idx] = gt_labels[gt_idx]

        max_overlaps = torch.zeros([num_bboxes, ]).to(bboxes)
        matched = torch.where(assigned_gt_inds > 0)
        matched_gt = gt_bboxes[assigned_gt_inds[matched].long() - 1]
        # the calculator returns all pairs; the diagonal holds the IoU of
        # each prediction with its own assigned gt box
        pair_iou = self.iou_calculator(matched_gt, bboxes[matched])
        max_overlaps[matched] = pair_iou.diag()

        return AssignResult(
            num_gts, assigned_gt_inds.long(), max_overlaps, labels=assigned_gt_labels
        )
@BBOX_ASSIGNERS.register_module()
class HungarianAssignerView2D(HungarianAssigner):
    """Hungarian assigner for 2D boxes with an additional view cost.

    On top of the classification, L1 and IoU costs of the parent
    ``HungarianAssigner``, a :class:`ViewCost` term is added to the cost of
    every prediction / ground-truth pair whose view values differ.

    Args:
        cls_cost (dict): Config of the classification cost.
        reg_cost (dict): Config of the box regression (L1) cost.
        iou_cost (dict): Config of the IoU cost.
    """

    def __init__(self,
                 cls_cost=dict(type='ClassificationCost', weight=1.),
                 reg_cost=dict(type='BBoxL1Cost', weight=1.0),
                 iou_cost=dict(type='IoUCost', iou_mode='giou', weight=1.0)):
        super(HungarianAssignerView2D, self).__init__(cls_cost, reg_cost, iou_cost)
        self.view_cost = ViewCost()

    def assign(self,
               bbox_pred,
               cls_pred,
               view,
               gt_bboxes,
               gt_labels,
               img_w,
               img_h,
               gt_bboxes_ignore=None,
               eps=1e-7):
        """Computes one-to-one matching based on the weighted costs.

        This method assign each query prediction to a ground truth or
        background. The `assigned_gt_inds` with -1 means don't care,
        0 means negative sample, and positive number is the index (1-based)
        of assigned gt.
        The assignment is done in the following steps, the order matters.

        1. assign every prediction to -1
        2. compute the weighted costs
        3. do Hungarian matching on CPU based on the costs
        4. assign all to 0 (background) first, then for each matched pair
           between predictions and gts, treat this prediction as foreground
           and assign the corresponding gt index (plus 1) to it.

        Args:
            bbox_pred (Tensor): Predicted boxes with normalized coordinates
                (cx, cy, w, h), which are all in range [0, 1]. Shape
                [num_query, 4].
            cls_pred (Tensor): Predicted classification logits, shape
                [num_query, num_class].
            view (Tensor): View value of every query; compared against the
                ground-truth views by the view cost.
            gt_bboxes (Tensor): Ground truth boxes with unnormalized
                coordinates (cx, cy, w, h). Shape [num_gt, 4].
            gt_labels (Tensor): Per ground-truth box, index 0 of the last
                dimension is the class label and index 1 is the view.
            img_w (int | float): Image width used to (de)normalize boxes.
            img_h (int | float): Image height used to (de)normalize boxes.
            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
                labelled as `ignored`. Only None is supported.
            eps (int | float, optional): Kept for interface compatibility;
                not used by this implementation. Default 1e-7.

        Returns:
            :obj:`AssignResult`: The assigned result. ``max_overlaps`` is
                always a tensor of length ``num_query`` (all zeros when there
                is nothing to match).
        """
        assert gt_bboxes_ignore is None, \
            'Only case when gt_bboxes_ignore is None is supported.'
        num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)

        gt_bboxes = bbox_cxcywh_to_xyxy(gt_bboxes)
        gt_views = gt_labels[..., 1]
        gt_labels = gt_labels[..., 0]

        # 1. assign -1 by default
        assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
                                              -1,
                                              dtype=torch.long)
        assigned_labels = bbox_pred.new_full((num_bboxes, ),
                                             -1,
                                             dtype=torch.long)
        if num_gts == 0 or num_bboxes == 0:
            # No ground truth or boxes, return empty assignment
            if num_gts == 0:
                # No ground truth, assign all to background
                assigned_gt_inds[:] = 0
            # Return zero overlaps rather than None so that callers can treat
            # ``max_overlaps`` as a tensor, exactly like the other Hungarian
            # assigners of this module do in their empty case.
            return AssignResult(
                num_gts,
                assigned_gt_inds,
                bbox_pred.new_zeros((num_bboxes, )),
                labels=assigned_labels)

        factor = gt_bboxes.new_tensor([img_w, img_h, img_w, img_h]).unsqueeze(0)

        # 2. compute the weighted costs
        # classification and bboxcost.
        cls_cost = self.cls_cost(cls_pred, gt_labels)
        # regression L1 cost
        normalize_gt_bboxes = gt_bboxes / factor
        reg_cost = self.reg_cost(bbox_pred, normalize_gt_bboxes)
        # regression iou cost, defaultly giou is used in official DETR.
        bboxes = bbox_cxcywh_to_xyxy(bbox_pred) * factor
        iou_cost = self.iou_cost(bboxes, gt_bboxes)
        # undo sign and weight of the cost to recover the overlap itself
        iou = -iou_cost / self.iou_cost.weight

        view_cost = self.view_cost(view, gt_views)

        # weighted sum of the four costs above
        cost = cls_cost + reg_cost + iou_cost + view_cost

        # 3. do Hungarian matching on CPU using linear_sum_assignment
        cost = cost.detach().cpu()
        if linear_sum_assignment is None:
            raise ImportError('Please run "pip install scipy" '
                              'to install scipy first.')
        matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
        matched_row_inds = torch.from_numpy(matched_row_inds).to(
            bbox_pred.device)
        matched_col_inds = torch.from_numpy(matched_col_inds).to(
            bbox_pred.device)

        # 4. assign backgrounds and foregrounds
        # assign all indices to backgrounds first
        assigned_gt_inds[:] = 0
        # assign foregrounds based on matching results
        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
        assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]

        # only matched queries carry a non-zero overlap
        max_overlaps = torch.zeros_like(iou.max(1).values)
        max_overlaps[matched_row_inds] = iou[matched_row_inds, matched_col_inds]

        return AssignResult(
            num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels)
The assignment is done in the following steps, the order matters. 1. assign every prediction to -1 2. compute the weighted costs 3. do Hungarian matching on CPU based on the costs 4. assign all to 0 (background) first, then for each matched pair between predictions and gts, treat this prediction as foreground and assign the corresponding gt index (plus 1) to it. Args: bbox_pred (Tensor): Predicted boxes with normalized coordinates (cx, cy, w, h), which are all in range [0, 1]. Shape [num_query, 4]. cls_pred (Tensor): Predicted classification logits, shape [num_query, num_class]. gt_bboxes (Tensor): Ground truth boxes with unnormalized coordinates (cx, cy, w, h). Shape [num_gt, 4]. gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). img_meta (dict): Meta information for current image. gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are labelled as `ignored`. Default None. eps (int | float, optional): A value added to the denominator for numerical stability. Default 1e-7. Returns: :obj:`AssignResult`: The assigned result. """ assert gt_bboxes_ignore is None, \ 'Only case when gt_bboxes_ignore is None is supported.' num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) gt_bboxes = bbox_cxcywh_to_xyxy(gt_bboxes) gt_views = gt_labels[..., 1] gt_labels = gt_labels[..., 0] # 1. assign -1 by default assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), -1, dtype=torch.long) assigned_labels = bbox_pred.new_full((num_bboxes, ), -1, dtype=torch.long) if num_gts == 0 or num_bboxes == 0: # No ground truth or boxes, return empty assignment if num_gts == 0: # No ground truth, assign all to background assigned_gt_inds[:] = 0 return AssignResult( num_gts, assigned_gt_inds, torch.zeros(assigned_gt_inds.shape[0]).to(assigned_gt_inds.device), labels=assigned_labels) factor = gt_bboxes.new_tensor([img_w, img_h, img_w, img_h]).unsqueeze(0) # 2. compute the weighted costs # classification and bboxcost. 
cls_cost = self.cls_cost(cls_pred, gt_labels) # regression L1 cost # reg_cost = self.reg_cost(bbox_pred, normalize_gt_bboxes) normalize_gt_centers = gt_centers / factor[:, :2] reg_cost = self.reg_cost(center_pred, normalize_gt_centers) normalize_gt_offsets = gt_offsets / factor # reg_cost = reg_cost + self.reg_cost(offset_pred, normalize_gt_offsets) / 2 reg_cost = reg_cost + self.reg_cost(offset_pred, normalize_gt_offsets) # regression iou cost, defaultly giou is used in official DETR. bboxes = bbox_cxcywh_to_xyxy(bbox_pred) * factor iou_cost = self.iou_cost(bboxes, gt_bboxes) iou = -iou_cost / self.iou_cost.weight view_cost = self.view_cost(view, gt_views) # weighted sum of above three costs cost = cls_cost + reg_cost + iou_cost + view_cost # 3. do Hungarian matching on CPU using linear_sum_assignment cost = cost.detach().cpu() if linear_sum_assignment is None: raise ImportError('Please run "pip install scipy" ' 'to install scipy first.') matched_row_inds, matched_col_inds = linear_sum_assignment(cost) matched_row_inds = torch.from_numpy(matched_row_inds).to( bbox_pred.device) matched_col_inds = torch.from_numpy(matched_col_inds).to( bbox_pred.device) # 4. 
class ViewCost:
    """Matching cost that penalizes pairs with different view values.

    The cost of a pair is ``weight`` times the absolute view difference
    clamped to [0, 1]: 0 for equal views and ``weight`` for views that
    differ by one or more.

    Args:
        weight (int | float): Scale of the penalty. Defaults to 1000.
    """

    def __init__(self, weight=1000):
        self.weight = weight

    def __call__(self, view_pred, gt_views):
        """Compute the pairwise view cost.

        Args:
            view_pred (Tensor): View value of every prediction, shape [P].
            gt_views (Tensor): View value of every ground truth, shape [G].

        Returns:
            Tensor: Cost matrix of shape [P, G].
        """
        # treat every view value as a 1-D point so cdist gives |a - b|
        pred_col = view_pred.unsqueeze(-1).float()
        gt_col = gt_views.unsqueeze(-1).float()
        abs_diff = torch.cdist(pred_col, gt_col, p=1)
        mismatch = torch.clamp(abs_diff, min=0, max=1)
        return mismatch * self.weight
def camera_to_lidar(points, r_rect, velo2cam):
    """Convert points in camera coordinate to lidar coordinate.

    The points are multiplied by the inverse of ``r_rect @ velo2cam``, so
    the two matrices together describe the forward (lidar to camera)
    transform that is undone here.

    Args:
        points (np.ndarray, shape=[..., 3] or [..., 4]): Points in camera
            coordinate. 3-D points get a homogeneous column of ones appended.
        r_rect (np.ndarray, shape=[4, 4]): First factor of the forward
            transform ``r_rect @ velo2cam``.
        velo2cam (np.ndarray, shape=[4, 4]): Second factor of the forward
            transform ``r_rect @ velo2cam``.

    Returns:
        np.ndarray, shape=[..., 3]: Points in lidar coordinate.
    """
    if points.shape[-1] == 3:
        leading_shape = list(points.shape[0:-1])
        homo_column = np.ones(leading_shape + [1])
        points = np.concatenate([points, homo_column], axis=-1)
    # points are row vectors, hence the transposed matrix
    cam2lidar_T = np.linalg.inv((r_rect @ velo2cam).T)
    transformed = points @ cam2lidar_T
    return transformed[..., :3]
def rotation_2d(points, angles):
    """Rotate batches of 2D points around the origin.

    A positive angle rotates clockwise: with an angle of pi / 2 the point
    (1, 0) is mapped to (0, -1).

    Args:
        points (np.ndarray): Points to be rotated with shape \
            (N, point_size, 2).
        angles (np.ndarray): Rotation angle with shape (N).

    Returns:
        np.ndarray: Same shape as points.
    """
    sin_a = np.sin(angles)
    cos_a = np.cos(angles)
    # transposed rotation matrices laid out as [2, 2, N]
    top_row = np.stack([cos_a, -sin_a])
    bottom_row = np.stack([sin_a, cos_a])
    rot_mat_T = np.stack([top_row, bottom_row])
    # per batch element a: points[a] @ rot_mat_T[:, :, a]
    return np.einsum('aij,jka->aik', points, rot_mat_T)
""" num_pts = np.sum(depth[trunc_pixel:, ] > 0.1) points = np.zeros((num_pts, 3), dtype=depth.dtype) x = np.array([0, 0, 1], dtype=depth.dtype) k = 0 for i in range(trunc_pixel, depth.shape[0]): for j in range(depth.shape[1]): if depth[i, j] > 0.1: x = np.array([j, i, 1], dtype=depth.dtype) points[k] = x * depth[i, j] k += 1 return points def depth_to_lidar_points(depth, trunc_pixel, P2, r_rect, velo2cam): """Convert depth map to points in lidar coordinate. Args: depth (np.array, shape=[H, W]): Depth map which the row of [0~`trunc_pixel`] are truncated. trunc_pixel (int): The number of truncated row. P2 (p.array, shape=[4, 4]): Intrinsics of Camera2. r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in specific camera coordinate (e.g. CAM2) to CAM0. velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in camera coordinate to lidar coordinate. Returns: np.ndarray: Points in lidar coordinates. """ pts = depth_to_points(depth, trunc_pixel) points_shape = list(pts.shape[0:-1]) points = np.concatenate([pts, np.ones(points_shape + [1])], axis=-1) points = points @ np.linalg.inv(P2.T) lidar_points = camera_to_lidar(points, r_rect, velo2cam) return lidar_points def rotation_3d_in_axis(points, angles, axis=0): """Rotate points in specific axis. Args: points (np.ndarray, shape=[N, point_size, 3]]): angles (np.ndarray, shape=[N]]): axis (int): Axis to rotate at. Returns: np.ndarray: Rotated points. 
""" # points: [N, point_size, 3] rot_sin = np.sin(angles) rot_cos = np.cos(angles) ones = np.ones_like(rot_cos) zeros = np.zeros_like(rot_cos) if axis == 1: rot_mat_T = np.stack([[rot_cos, zeros, -rot_sin], [zeros, ones, zeros], [rot_sin, zeros, rot_cos]]) elif axis == 2 or axis == -1: rot_mat_T = np.stack([[rot_cos, -rot_sin, zeros], [rot_sin, rot_cos, zeros], [zeros, zeros, ones]]) elif axis == 0: rot_mat_T = np.stack([[zeros, rot_cos, -rot_sin], [zeros, rot_sin, rot_cos], [ones, zeros, zeros]]) else: raise ValueError('axis should in range') return np.einsum('aij,jka->aik', points, rot_mat_T) def center_to_corner_box3d(centers, dims, angles=None, origin=(0.5, 1.0, 0.5), axis=1): """Convert kitti locations, dimensions and angles to corners. Args: centers (np.ndarray): Locations in kitti label file with shape (N, 3). dims (np.ndarray): Dimensions in kitti label file with shape (N, 3). angles (np.ndarray): Rotation_y in kitti label file with shape (N). origin (list or array or float): Origin point relate to smallest point. use (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar. axis (int): Rotation axis. 1 for camera and 2 for lidar. Returns: np.ndarray: Corners with the shape of (N, 8, 3). """ # 'length' in kitti format is in x axis. # yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) # center in kitti format is [0.5, 1.0, 0.5] in xyz. corners = corners_nd(dims, origin=origin) # corners: [N, 8, 3] if angles is not None: corners = rotation_3d_in_axis(corners, angles, axis=axis) corners += centers.reshape([-1, 1, 3]) return corners @numba.jit(nopython=True) def box2d_to_corner_jit(boxes): """Convert box2d to corner. Args: boxes (np.ndarray, shape=[N, 5]): Boxes2d with rotation. Returns: box_corners (np.ndarray, shape=[N, 4, 2]): Box corners. 
""" num_box = boxes.shape[0] corners_norm = np.zeros((4, 2), dtype=boxes.dtype) corners_norm[1, 1] = 1.0 corners_norm[2] = 1.0 corners_norm[3, 0] = 1.0 corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) corners = boxes.reshape(num_box, 1, 5)[:, :, 2:4] * corners_norm.reshape( 1, 4, 2) rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) box_corners = np.zeros((num_box, 4, 2), dtype=boxes.dtype) for i in range(num_box): rot_sin = np.sin(boxes[i, -1]) rot_cos = np.cos(boxes[i, -1]) rot_mat_T[0, 0] = rot_cos rot_mat_T[0, 1] = -rot_sin rot_mat_T[1, 0] = rot_sin rot_mat_T[1, 1] = rot_cos box_corners[i] = corners[i] @ rot_mat_T + boxes[i, :2] return box_corners @numba.njit def corner_to_standup_nd_jit(boxes_corner): """Convert boxes_corner to aligned (min-max) boxes. Args: boxes_corner (np.ndarray, shape=[N, 2**dim, dim]): Boxes corners. Returns: np.ndarray, shape=[N, dim*2]: Aligned (min-max) boxes. """ num_boxes = boxes_corner.shape[0] ndim = boxes_corner.shape[-1] result = np.zeros((num_boxes, ndim * 2), dtype=boxes_corner.dtype) for i in range(num_boxes): for j in range(ndim): result[i, j] = np.min(boxes_corner[i, :, j]) for j in range(ndim): result[i, j + ndim] = np.max(boxes_corner[i, :, j]) return result @numba.jit(nopython=True) def corner_to_surfaces_3d_jit(corners): """Convert 3d box corners from corner function above to surfaces that normal vectors all direct to internal. Args: corners (np.ndarray): 3d box corners with the shape of (N, 8, 3). Returns: np.ndarray: Surfaces with the shape of (N, 6, 4, 3). 
""" # box_corners: [N, 8, 3], must from corner functions in this module num_boxes = corners.shape[0] surfaces = np.zeros((num_boxes, 6, 4, 3), dtype=corners.dtype) corner_idxes = np.array([ 0, 1, 2, 3, 7, 6, 5, 4, 0, 3, 7, 4, 1, 5, 6, 2, 0, 4, 5, 1, 3, 2, 6, 7 ]).reshape(6, 4) for i in range(num_boxes): for j in range(6): for k in range(4): surfaces[i, j, k] = corners[i, corner_idxes[j, k]] return surfaces def rotation_points_single_angle(points, angle, axis=0): """Rotate points with a single angle. Args: points (np.ndarray, shape=[N, 3]]): angles (np.ndarray, shape=[1]]): axis (int): Axis to rotate at. Returns: np.ndarray: Rotated points. """ # points: [N, 3] rot_sin = np.sin(angle) rot_cos = np.cos(angle) if axis == 1: rot_mat_T = np.array( [[rot_cos, 0, -rot_sin], [0, 1, 0], [rot_sin, 0, rot_cos]], dtype=points.dtype) elif axis == 2 or axis == -1: rot_mat_T = np.array( [[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]], dtype=points.dtype) elif axis == 0: rot_mat_T = np.array( [[1, 0, 0], [0, rot_cos, -rot_sin], [0, rot_sin, rot_cos]], dtype=points.dtype) else: raise ValueError('axis should in range') return points @ rot_mat_T, rot_mat_T def points_cam2img(points_3d, proj_mat, with_depth=False): """Project points in camera coordinates to image coordinates. Args: points_3d (np.ndarray): Points in shape (N, 3) proj_mat (np.ndarray): Transformation matrix between coordinates. Returns: np.ndarray: Points in image coordinates with shape [N, 2]. """ points_shape = list(points_3d.shape) points_shape[-1] = 1 points_4 = np.concatenate([points_3d, np.ones(points_shape)], axis=-1) assert len(proj_mat.shape) == 2, 'The dimension of the projection'\ f' matrix should be 2 instead of {len(proj_mat.shape)}.' d1, d2 = proj_mat.shape[:2] assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or ( d1 == 4 and d2 == 4), 'The shape of the projection matrix'\ f' ({d1}*{d2}) is not supported.' 
def corner_to_surfaces_3d(corners):
    """Gather the six faces of every 3D box from its eight corners.

    The corner order inside each face follows the original listing, which is
    documented as making all normal vectors point to the inside of the box.

    Args:
        corners (np.ndarray): 3D box corners with shape of (N, 8, 3).

    Returns:
        np.ndarray: Surfaces with the shape of (N, 6, 4, 3).
    """
    # box_corners: [N, 8, 3], must from corner functions in this module
    face_corner_ids = np.array([
        [0, 1, 2, 3],
        [7, 6, 5, 4],
        [0, 3, 7, 4],
        [1, 5, 6, 2],
        [0, 4, 5, 1],
        [3, 2, 6, 7],
    ])
    # integer-array indexing on the corner axis yields [N, 6, 4, 3] directly
    surfaces = corners[:, face_corner_ids]
    return surfaces
def limit_period(val, offset=0.5, period=np.pi):
    """Wrap values into a single period of a periodic function.

    Args:
        val (np.ndarray): The value to be converted.
        offset (float, optional): Offset that positions the output range.
            Defaults to 0.5.
        period (float, optional): Period of the value. Defaults to np.pi.

    Returns:
        np.ndarray: Value in the range of
            [-offset * period, (1-offset) * period].
    """
    # Number of whole periods to remove so the result lands in the range.
    whole_periods = np.floor(val / period + offset)
    return val - whole_periods * period
def center_to_minmax_2d(centers, dims, origin=0.5):
    """Convert center/size boxes to (min, max) corner form.

    Args:
        centers (np.ndarray): Center points.
        dims (np.ndarray): Dimensions.
        origin (list or array or float): Origin point relative to the
            smallest point. Defaults to 0.5.

    Returns:
        np.ndarray: Minmax points.
    """
    # Only compare scalar origins: ``array == 0.5`` yields an array, and
    # using that as an ``if`` condition raises for multi-element arrays.
    if np.ndim(origin) == 0 and origin == 0.5:
        # Centered origin: min/max are simply center -/+ half the size.
        return np.concatenate([centers - dims / 2, centers + dims / 2],
                              axis=-1)
    # Generic origin (scalar, list or array): go through the corner helper
    # and keep corners 0 and 2 of each box as its min and max points.
    corners = center_to_corner_box2d(centers, dims, origin=origin)
    return corners[:, [0, 2]].reshape([-1, 4])
""" rots = rbboxes[..., -1] rots_0_pi_div_2 = np.abs(limit_period(rots, 0.5, np.pi)) cond = (rots_0_pi_div_2 > np.pi / 4)[..., np.newaxis] bboxes_center = np.where(cond, rbboxes[:, [0, 1, 3, 2]], rbboxes[:, :4]) bboxes = center_to_minmax_2d(bboxes_center[:, :2], bboxes_center[:, 2:]) return bboxes @numba.jit(nopython=True) def iou_jit(boxes, query_boxes, mode='iou', eps=0.0): """Calculate box iou. Note that jit version runs ~10x faster than the box_overlaps function in mmdet3d.core.evaluation. Args: boxes (np.ndarray): Input bounding boxes with shape of (N, 4). query_boxes (np.ndarray): Query boxes with shape of (K, 4). Returns: np.ndarray: Overlap between boxes and query_boxes with the shape of [N, K]. """ N = boxes.shape[0] K = query_boxes.shape[0] overlaps = np.zeros((N, K), dtype=boxes.dtype) for k in range(K): box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + eps) * (query_boxes[k, 3] - query_boxes[k, 1] + eps)) for n in range(N): iw = ( min(boxes[n, 2], query_boxes[k, 2]) - max(boxes[n, 0], query_boxes[k, 0]) + eps) if iw > 0: ih = ( min(boxes[n, 3], query_boxes[k, 3]) - max(boxes[n, 1], query_boxes[k, 1]) + eps) if ih > 0: if mode == 'iou': ua = ((boxes[n, 2] - boxes[n, 0] + eps) * (boxes[n, 3] - boxes[n, 1] + eps) + box_area - iw * ih) else: ua = ((boxes[n, 2] - boxes[n, 0] + eps) * (boxes[n, 3] - boxes[n, 1] + eps)) overlaps[n, k] = iw * ih / ua return overlaps def projection_matrix_to_CRT_kitti(proj): """Split projection matrix of kitti. P = C @ [R|T] C is upper triangular matrix, so we need to inverse CR and use QR stable for all kitti camera projection matrix. Args: proj (p.array, shape=[4, 4]): Intrinsics of camera. Returns: tuple[np.ndarray]: Splited matrix of C, R and T. 
""" CR = proj[0:3, 0:3] CT = proj[0:3, 3] RinvCinv = np.linalg.inv(CR) Rinv, Cinv = np.linalg.qr(RinvCinv) C = np.linalg.inv(Cinv) R = np.linalg.inv(Rinv) T = Cinv @ CT return C, R, T def remove_outside_points(points, rect, Trv2c, P2, image_shape): """Remove points which are outside of image. Args: points (np.ndarray, shape=[N, 3+dims]): Total points. rect (np.ndarray, shape=[4, 4]): Matrix to project points in specific camera coordinate (e.g. CAM2) to CAM0. Trv2c (np.ndarray, shape=[4, 4]): Matrix to project points in camera coordinate to lidar coordinate. P2 (p.array, shape=[4, 4]): Intrinsics of Camera2. image_shape (list[int]): Shape of image. Returns: np.ndarray, shape=[N, 3+dims]: Filtered points. """ # 5x faster than remove_outside_points_v1(2ms vs 10ms) C, R, T = projection_matrix_to_CRT_kitti(P2) image_bbox = [0, 0, image_shape[1], image_shape[0]] frustum = get_frustum(image_bbox, C) frustum -= T frustum = np.linalg.inv(R) @ frustum.T frustum = camera_to_lidar(frustum.T, rect, Trv2c) frustum_surfaces = corner_to_surfaces_3d_jit(frustum[np.newaxis, ...]) indices = points_in_convex_polygon_3d_jit(points[:, :3], frustum_surfaces) points = points[indices.reshape([-1])] return points def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100): """Get frustum corners in camera coordinates. Args: bbox_image (list[int]): box in image coordinates. C (np.ndarray): Intrinsics. near_clip (float): Nearest distance of frustum. far_clip (float): Farthest distance of frustum. Returns: np.ndarray, shape=[8, 3]: coordinates of frustum corners. 
""" fku = C[0, 0] fkv = -C[1, 1] u0v0 = C[0:2, 2] z_points = np.array( [near_clip] * 4 + [far_clip] * 4, dtype=C.dtype)[:, np.newaxis] b = bbox_image box_corners = np.array( [[b[0], b[1]], [b[0], b[3]], [b[2], b[3]], [b[2], b[1]]], dtype=C.dtype) near_box_corners = (box_corners - u0v0) / np.array( [fku / near_clip, -fkv / near_clip], dtype=C.dtype) far_box_corners = (box_corners - u0v0) / np.array( [fku / far_clip, -fkv / far_clip], dtype=C.dtype) ret_xy = np.concatenate([near_box_corners, far_box_corners], axis=0) # [8, 2] ret_xyz = np.concatenate([ret_xy, z_points], axis=1) return ret_xyz def surface_equ_3d(polygon_surfaces): """ Args: polygon_surfaces (np.ndarray): Polygon surfaces with shape of [num_polygon, max_num_surfaces, max_num_points_of_surface, 3]. All surfaces' normal vector must direct to internal. Max_num_points_of_surface must at least 3. Returns: tuple: normal vector and its direction. """ # return [a, b, c], d in ax+by+cz+d=0 # polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3] surface_vec = polygon_surfaces[:, :, :2, :] - \ polygon_surfaces[:, :, 1:3, :] # normal_vec: [..., 3] normal_vec = np.cross(surface_vec[:, :, 0, :], surface_vec[:, :, 1, :]) # print(normal_vec.shape, points[..., 0, :].shape) # d = -np.inner(normal_vec, points[..., 0, :]) d = np.einsum('aij, aij->ai', normal_vec, polygon_surfaces[:, :, 0, :]) return normal_vec, -d @numba.njit def _points_in_convex_polygon_3d_jit(points, polygon_surfaces, normal_vec, d, num_surfaces): """ Args: points (np.ndarray): Input points with shape of (num_points, 3). polygon_surfaces (np.ndarray): Polygon surfaces with shape of (num_polygon, max_num_surfaces, max_num_points_of_surface, 3). All surfaces' normal vector must direct to internal. Max_num_points_of_surface must at least 3. normal_vec (np.ndarray): Normal vector of polygon_surfaces. d (int): Directions of normal vector. num_surfaces (np.ndarray): Number of surfaces a polygon contains shape of (num_polygon). 
Returns: np.ndarray: Result matrix with the shape of [num_points, num_polygon]. """ max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] num_points = points.shape[0] num_polygons = polygon_surfaces.shape[0] ret = np.ones((num_points, num_polygons), dtype=np.bool_) sign = 0.0 for i in range(num_points): for j in range(num_polygons): for k in range(max_num_surfaces): if k > num_surfaces[j]: break sign = ( points[i, 0] * normal_vec[j, k, 0] + points[i, 1] * normal_vec[j, k, 1] + points[i, 2] * normal_vec[j, k, 2] + d[j, k]) if sign >= 0: ret[i, j] = False break return ret def points_in_convex_polygon_3d_jit(points, polygon_surfaces, num_surfaces=None): """Check points is in 3d convex polygons. Args: points (np.ndarray): Input points with shape of (num_points, 3). polygon_surfaces (np.ndarray): Polygon surfaces with shape of \ (num_polygon, max_num_surfaces, max_num_points_of_surface, 3). \ All surfaces' normal vector must direct to internal. \ Max_num_points_of_surface must at least 3. num_surfaces (np.ndarray): Number of surfaces a polygon contains \ shape of (num_polygon). Returns: np.ndarray: Result matrix with the shape of [num_points, num_polygon]. """ max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] # num_points = points.shape[0] num_polygons = polygon_surfaces.shape[0] if num_surfaces is None: num_surfaces = np.full((num_polygons, ), 9999999, dtype=np.int64) normal_vec, d = surface_equ_3d(polygon_surfaces[:, :, :3, :]) # normal_vec: [num_polygon, max_num_surfaces, 3] # d: [num_polygon, max_num_surfaces] return _points_in_convex_polygon_3d_jit(points, polygon_surfaces, normal_vec, d, num_surfaces) @numba.jit def points_in_convex_polygon_jit(points, polygon, clockwise=True): """Check points is in 2d convex polygons. True when point in polygon. Args: points (np.ndarray): Input points with the shape of [num_points, 2]. polygon (np.ndarray): Input polygon with the shape of [num_polygon, num_points_of_polygon, 2]. 
def boxes3d_to_corners3d_lidar(boxes3d, bottom_center=True):
    """Convert kitti center boxes to corners.

        7 -------- 4
       /|         /|
      6 -------- 5 .
      | |        | |
      . 3 -------- 0
      |/         |/
      2 -------- 1

    Args:
        boxes3d (np.ndarray): Boxes with shape of (N, 7)
            [x, y, z, w, l, h, ry] in LiDAR coords, see the definition of
            ry in KITTI dataset.
        bottom_center (bool): Whether z is on the bottom center of object.

    Returns:
        np.ndarray: Box corners with the shape of [N, 8, 3], as float32.
    """
    num_boxes = boxes3d.shape[0]
    half_w = boxes3d[:, 3] / 2.
    half_l = boxes3d[:, 4] / 2.
    height = boxes3d[:, 5]

    # Corner offsets in the box frame, one column per corner (N, 8).
    x_local = np.stack([
        half_w, -half_w, -half_w, half_w, half_w, -half_w, -half_w, half_w
    ], axis=1).astype(np.float32)
    y_local = np.stack([
        -half_l, -half_l, half_l, half_l, -half_l, -half_l, half_l, half_l
    ], axis=1).astype(np.float32)
    if bottom_center:
        # z is the box bottom: lower corners stay at 0, upper ones at h.
        z_local = np.zeros((num_boxes, 8), dtype=np.float32)
        z_local[:, 4:8] = height[:, np.newaxis]
    else:
        half_h = height / 2.
        z_local = np.stack([
            -half_h, -half_h, -half_h, -half_h, half_h, half_h, half_h,
            half_h
        ], axis=1).astype(np.float32)

    # One 3x3 matrix per box, applied on the right of row vectors.
    yaw = boxes3d[:, 6]
    cos_yaw, sin_yaw = np.cos(yaw), np.sin(yaw)
    zeros = np.zeros(yaw.size, dtype=np.float32)
    ones = np.ones(yaw.size, dtype=np.float32)
    rot_mats = np.array([[cos_yaw, -sin_yaw, zeros],
                         [sin_yaw, cos_yaw, zeros],
                         [zeros, zeros, ones]]).transpose(2, 0, 1)  # (N, 3, 3)

    local_corners = np.stack([x_local, y_local, z_local], axis=2)  # (N, 8, 3)
    rotated = np.matmul(local_corners, rot_mats)

    # Shift every corner by its box location.
    corners = boxes3d[:, np.newaxis, 0:3] + rotated
    return corners.astype(np.float32)
TransFusionBBoxCoder from .camera_bbox_coder import CameraBBoxCoder __all__ = [ 'build_bbox_coder', 'DeltaXYZWLHRBBoxCoder', 'PartialBinBasedBBoxCoder', 'CenterPointBBoxCoder', 'AnchorFreeBBoxCoder', 'TransFusionBBoxCoder', 'CameraBBoxCoder' ] ================================================ FILE: mmdet3d/core/bbox/coders/anchor_free_bbox_coder.py ================================================ import numpy as np import torch from mmdet.core.bbox.builder import BBOX_CODERS from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder @BBOX_CODERS.register_module() class AnchorFreeBBoxCoder(PartialBinBasedBBoxCoder): """Anchor free bbox coder for 3D boxes. Args: num_dir_bins (int): Number of bins to encode direction angle. with_rot (bool): Whether the bbox is with rotation. """ def __init__(self, num_dir_bins, with_rot=True): super(AnchorFreeBBoxCoder, self).__init__( num_dir_bins, 0, [], with_rot=with_rot) self.num_dir_bins = num_dir_bins self.with_rot = with_rot def encode(self, gt_bboxes_3d, gt_labels_3d): """Encode ground truth to prediction targets. Args: gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes \ with shape (n, 7). gt_labels_3d (torch.Tensor): Ground truth classes. Returns: tuple: Targets of center, size and direction. """ # generate center target center_target = gt_bboxes_3d.gravity_center # generate bbox size target size_res_target = gt_bboxes_3d.dims / 2 # generate dir target box_num = gt_labels_3d.shape[0] if self.with_rot: (dir_class_target, dir_res_target) = self.angle2class(gt_bboxes_3d.yaw) dir_res_target /= (2 * np.pi / self.num_dir_bins) else: dir_class_target = gt_labels_3d.new_zeros(box_num) dir_res_target = gt_bboxes_3d.tensor.new_zeros(box_num) return (center_target, size_res_target, dir_class_target, dir_res_target) def decode(self, bbox_out): """Decode predicted parts to bbox3d. Args: bbox_out (dict): Predictions from model, should contain keys below. - center: predicted bottom center of bboxes. 
- dir_class: predicted bbox direction class. - dir_res: predicted bbox direction residual. - size: predicted bbox size. Returns: torch.Tensor: Decoded bbox3d with shape (batch, n, 7). """ center = bbox_out['center'] batch_size, num_proposal = center.shape[:2] # decode heading angle if self.with_rot: dir_class = torch.argmax(bbox_out['dir_class'], -1) dir_res = torch.gather(bbox_out['dir_res'], 2, dir_class.unsqueeze(-1)) dir_res.squeeze_(2) dir_angle = self.class2angle(dir_class, dir_res).reshape( batch_size, num_proposal, 1) else: dir_angle = center.new_zeros(batch_size, num_proposal, 1) # decode bbox size bbox_size = torch.clamp(bbox_out['size'] * 2, min=0.1) bbox3d = torch.cat([center, bbox_size, dir_angle], dim=-1) return bbox3d def split_pred(self, cls_preds, reg_preds, base_xyz): """Split predicted features to specific parts. Args: cls_preds (torch.Tensor): Class predicted features to split. reg_preds (torch.Tensor): Regression predicted features to split. base_xyz (torch.Tensor): Coordinates of points. Returns: dict[str, torch.Tensor]: Split results. 
""" results = {} results['obj_scores'] = cls_preds start, end = 0, 0 reg_preds_trans = reg_preds.transpose(2, 1) # decode center end += 3 # (batch_size, num_proposal, 3) results['center_offset'] = reg_preds_trans[..., start:end] results['center'] = base_xyz.detach() + reg_preds_trans[..., start:end] start = end # decode center end += 3 # (batch_size, num_proposal, 3) results['size'] = reg_preds_trans[..., start:end] start = end # decode direction end += self.num_dir_bins results['dir_class'] = reg_preds_trans[..., start:end] start = end end += self.num_dir_bins dir_res_norm = reg_preds_trans[..., start:end] start = end results['dir_res_norm'] = dir_res_norm results['dir_res'] = dir_res_norm * (2 * np.pi / self.num_dir_bins) return results ================================================ FILE: mmdet3d/core/bbox/coders/camera_bbox_coder.py ================================================ import torch from mmdet.core.bbox import BaseBBoxCoder from mmdet.core.bbox.builder import BBOX_CODERS @BBOX_CODERS.register_module() class CameraBBoxCoder(BaseBBoxCoder): def __init__(self, code_size=8): self.code_size = code_size def encode(self, dst_boxes): targets = torch.zeros([dst_boxes.shape[0], self.code_size]).to(dst_boxes.device) targets[:, 3] = dst_boxes[:, 3].log() targets[:, 4] = dst_boxes[:, 4].log() targets[:, 5] = dst_boxes[:, 5].log() targets[:, 6] = torch.sin(dst_boxes[:, 6]) targets[:, 7] = torch.cos(dst_boxes[:, 6]) targets[:, 0] = dst_boxes[:, 0] targets[:, 1] = dst_boxes[:, 1] - 0.5 * dst_boxes[:, 4] targets[:, 2] = dst_boxes[:, 2] if self.code_size == 10: targets[:, 8:10] = dst_boxes[:, 7:] return targets def decode(self, cls, rot, dim, center, vel): """Decode bboxes. Args: cls (torch.Tensor): Heatmap with the shape of [B, num_cls, num_proposals]. rot (torch.Tensor): Rotation with the shape of [B, 2, num_proposals]. dim (torch.Tensor): Dim of the boxes with the shape of [B, 3, num_proposals]. 
center (torch.Tensor): bev center of the boxes with the shape of [B, 3, num_proposals]. (in feature map metric) vel (torch.Tensor): Velocity with the shape of [B, 2, num_proposals]. Returns: list[dict]: Decoded boxes. """ # class label final_preds = cls.max(1, keepdims=False).indices final_scores = cls.max(1, keepdims=False).values dim[:, 0, :] = dim[:, 0, :].exp() dim[:, 1, :] = dim[:, 1, :].exp() dim[:, 2, :] = dim[:, 2, :].exp() # dim = torch.exp(dim) rots, rotc = rot[:, 0:1, :], rot[:, 1:2, :] rot = torch.atan2(rots, rotc) center = center.clone() center[:, 1, :] = center[:, 1, :] + 0.5 * dim[:, 1, :] if vel is None: final_box_preds = torch.cat([center, dim, rot], dim=1).permute(0, 2, 1) else: final_box_preds = torch.cat([center, dim, rot, vel], dim=1).permute(0, 2, 1) predictions_dicts = [] for i in range(cls.shape[0]): boxes3d = final_box_preds[i] scores = final_scores[i] labels = final_preds[i] predictions_dict = { 'bboxes': boxes3d, 'scores': scores, 'labels': labels } predictions_dicts.append(predictions_dict) return predictions_dicts @staticmethod def decode_yaw(bbox, centers2d, cam2img): bbox[:, 6] = torch.atan2(centers2d[:, 0] - cam2img[0, 2], cam2img[0, 0]) + bbox[:, 6] return bbox ================================================ FILE: mmdet3d/core/bbox/coders/centerpoint_bbox_coders.py ================================================ import torch from mmdet.core.bbox import BaseBBoxCoder from mmdet.core.bbox.builder import BBOX_CODERS @BBOX_CODERS.register_module() class CenterPointBBoxCoder(BaseBBoxCoder): """Bbox coder for CenterPoint. Args: pc_range (list[float]): Range of point cloud. out_size_factor (int): Downsample factor of the model. voxel_size (list[float]): Size of voxel. post_center_range (list[float]): Limit of the center. Default: None. max_num (int): Max number to be kept. Default: 100. score_threshold (float): Threshold to filter boxes based on score. Default: None. code_size (int): Code size of bboxes. 
Default: 9 """ def __init__(self, pc_range, out_size_factor, voxel_size, post_center_range=None, max_num=100, score_threshold=None, code_size=9): self.pc_range = pc_range self.out_size_factor = out_size_factor self.voxel_size = voxel_size self.post_center_range = post_center_range self.max_num = max_num self.score_threshold = score_threshold self.code_size = code_size def _gather_feat(self, feats, inds, feat_masks=None): """Given feats and indexes, returns the gathered feats. Args: feats (torch.Tensor): Features to be transposed and gathered with the shape of [B, 2, W, H]. inds (torch.Tensor): Indexes with the shape of [B, N]. feat_masks (torch.Tensor): Mask of the feats. Default: None. Returns: torch.Tensor: Gathered feats. """ dim = feats.size(2) inds = inds.unsqueeze(2).expand(inds.size(0), inds.size(1), dim) feats = feats.gather(1, inds) if feat_masks is not None: feat_masks = feat_masks.unsqueeze(2).expand_as(feats) feats = feats[feat_masks] feats = feats.view(-1, dim) return feats def _topk(self, scores, K=80): """Get indexes based on scores. Args: scores (torch.Tensor): scores with the shape of [B, N, W, H]. K (int): Number to be kept. Defaults to 80. Returns: tuple[torch.Tensor] torch.Tensor: Selected scores with the shape of [B, K]. torch.Tensor: Selected indexes with the shape of [B, K]. torch.Tensor: Selected classes with the shape of [B, K]. torch.Tensor: Selected y coord with the shape of [B, K]. torch.Tensor: Selected x coord with the shape of [B, K]. 
""" batch, cat, height, width = scores.size() topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K) topk_inds = topk_inds % (height * width) topk_ys = (topk_inds.float() / torch.tensor(width, dtype=torch.float)).int().float() topk_xs = (topk_inds % width).int().float() topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K) topk_clses = (topk_ind / torch.tensor(K, dtype=torch.float)).int() topk_inds = self._gather_feat(topk_inds.view(batch, -1, 1), topk_ind).view(batch, K) topk_ys = self._gather_feat(topk_ys.view(batch, -1, 1), topk_ind).view(batch, K) topk_xs = self._gather_feat(topk_xs.view(batch, -1, 1), topk_ind).view(batch, K) return topk_score, topk_inds, topk_clses, topk_ys, topk_xs def _transpose_and_gather_feat(self, feat, ind): """Given feats and indexes, returns the transposed and gathered feats. Args: feat (torch.Tensor): Features to be transposed and gathered with the shape of [B, 2, W, H]. ind (torch.Tensor): Indexes with the shape of [B, N]. Returns: torch.Tensor: Transposed and gathered feats. """ feat = feat.permute(0, 2, 3, 1).contiguous() feat = feat.view(feat.size(0), -1, feat.size(3)) feat = self._gather_feat(feat, ind) return feat def encode(self): pass def decode(self, heat, rot_sine, rot_cosine, hei, dim, vel, reg=None, task_id=-1): """Decode bboxes. Args: heat (torch.Tensor): Heatmap with the shape of [B, N, W, H]. rot_sine (torch.Tensor): Sine of rotation with the shape of [B, 1, W, H]. rot_cosine (torch.Tensor): Cosine of rotation with the shape of [B, 1, W, H]. hei (torch.Tensor): Height of the boxes with the shape of [B, 1, W, H]. dim (torch.Tensor): Dim of the boxes with the shape of [B, 1, W, H]. vel (torch.Tensor): Velocity with the shape of [B, 1, W, H]. reg (torch.Tensor): Regression value of the boxes in 2D with the shape of [B, 2, W, H]. Default: None. task_id (int): Index of task. Default: -1. Returns: list[dict]: Decoded boxes. 
""" batch, cat, _, _ = heat.size() scores, inds, clses, ys, xs = self._topk(heat, K=self.max_num) if reg is not None: reg = self._transpose_and_gather_feat(reg, inds) reg = reg.view(batch, self.max_num, 2) xs = xs.view(batch, self.max_num, 1) + reg[:, :, 0:1] ys = ys.view(batch, self.max_num, 1) + reg[:, :, 1:2] else: xs = xs.view(batch, self.max_num, 1) + 0.5 ys = ys.view(batch, self.max_num, 1) + 0.5 # rotation value and direction label rot_sine = self._transpose_and_gather_feat(rot_sine, inds) rot_sine = rot_sine.view(batch, self.max_num, 1) rot_cosine = self._transpose_and_gather_feat(rot_cosine, inds) rot_cosine = rot_cosine.view(batch, self.max_num, 1) rot = torch.atan2(rot_sine, rot_cosine) # height in the bev hei = self._transpose_and_gather_feat(hei, inds) hei = hei.view(batch, self.max_num, 1) # dim of the box dim = self._transpose_and_gather_feat(dim, inds) dim = dim.view(batch, self.max_num, 3) # class label clses = clses.view(batch, self.max_num).float() scores = scores.view(batch, self.max_num) xs = xs.view( batch, self.max_num, 1) * self.out_size_factor * self.voxel_size[0] + self.pc_range[0] ys = ys.view( batch, self.max_num, 1) * self.out_size_factor * self.voxel_size[1] + self.pc_range[1] if vel is None: # KITTI FORMAT final_box_preds = torch.cat([xs, ys, hei, dim, rot], dim=2) else: # exist velocity, nuscene format vel = self._transpose_and_gather_feat(vel, inds) vel = vel.view(batch, self.max_num, 2) final_box_preds = torch.cat([xs, ys, hei, dim, rot, vel], dim=2) final_scores = scores final_preds = clses # use score threshold if self.score_threshold is not None: thresh_mask = final_scores > self.score_threshold if self.post_center_range is not None: self.post_center_range = torch.tensor( self.post_center_range, device=heat.device) mask = (final_box_preds[..., :3] >= self.post_center_range[:3]).all(2) mask &= (final_box_preds[..., :3] <= self.post_center_range[3:]).all(2) predictions_dicts = [] for i in range(batch): cmask = mask[i, :] if 
self.score_threshold: cmask &= thresh_mask[i] boxes3d = final_box_preds[i, cmask] scores = final_scores[i, cmask] labels = final_preds[i, cmask] predictions_dict = { 'bboxes': boxes3d, 'scores': scores, 'labels': labels } predictions_dicts.append(predictions_dict) else: raise NotImplementedError( 'Need to reorganize output as a batch, only ' 'support post_center_range is not None for now!') return predictions_dicts ================================================ FILE: mmdet3d/core/bbox/coders/delta_xyzwhlr_bbox_coder.py ================================================ import torch from mmdet.core.bbox import BaseBBoxCoder from mmdet.core.bbox.builder import BBOX_CODERS @BBOX_CODERS.register_module() class DeltaXYZWLHRBBoxCoder(BaseBBoxCoder): """Bbox Coder for 3D boxes. Args: code_size (int): The dimension of boxes to be encoded. """ def __init__(self, code_size=7): super(DeltaXYZWLHRBBoxCoder, self).__init__() self.code_size = code_size @staticmethod def encode(src_boxes, dst_boxes): """Get box regression transformation deltas (dx, dy, dz, dw, dh, dl, dr, dv*) that can be used to transform the `src_boxes` into the `target_boxes`. Args: src_boxes (torch.Tensor): source boxes, e.g., object proposals. dst_boxes (torch.Tensor): target of the transformation, e.g., ground-truth boxes. Returns: torch.Tensor: Box transformation deltas. 
""" box_ndim = src_boxes.shape[-1] cas, cgs, cts = [], [], [] if box_ndim > 7: xa, ya, za, wa, la, ha, ra, *cas = torch.split( src_boxes, 1, dim=-1) xg, yg, zg, wg, lg, hg, rg, *cgs = torch.split( dst_boxes, 1, dim=-1) cts = [g - a for g, a in zip(cgs, cas)] else: xa, ya, za, wa, la, ha, ra = torch.split(src_boxes, 1, dim=-1) xg, yg, zg, wg, lg, hg, rg = torch.split(dst_boxes, 1, dim=-1) za = za + ha / 2 zg = zg + hg / 2 diagonal = torch.sqrt(la**2 + wa**2) xt = (xg - xa) / diagonal yt = (yg - ya) / diagonal zt = (zg - za) / ha lt = torch.log(lg / la) wt = torch.log(wg / wa) ht = torch.log(hg / ha) rt = rg - ra return torch.cat([xt, yt, zt, wt, lt, ht, rt, *cts], dim=-1) @staticmethod def decode(anchors, deltas): """Apply transformation `deltas` (dx, dy, dz, dw, dh, dl, dr, dv*) to `boxes`. Args: anchors (torch.Tensor): Parameters of anchors with shape (N, 7). deltas (torch.Tensor): Encoded boxes with shape (N, 7+n) [x, y, z, w, l, h, r, velo*]. Returns: torch.Tensor: Decoded boxes. """ cas, cts = [], [] box_ndim = anchors.shape[-1] if box_ndim > 7: xa, ya, za, wa, la, ha, ra, *cas = torch.split(anchors, 1, dim=-1) xt, yt, zt, wt, lt, ht, rt, *cts = torch.split(deltas, 1, dim=-1) else: xa, ya, za, wa, la, ha, ra = torch.split(anchors, 1, dim=-1) xt, yt, zt, wt, lt, ht, rt = torch.split(deltas, 1, dim=-1) za = za + ha / 2 diagonal = torch.sqrt(la**2 + wa**2) xg = xt * diagonal + xa yg = yt * diagonal + ya zg = zt * ha + za lg = torch.exp(lt) * la wg = torch.exp(wt) * wa hg = torch.exp(ht) * ha rg = rt + ra zg = zg - hg / 2 cgs = [t + a for t, a in zip(cts, cas)] return torch.cat([xg, yg, zg, wg, lg, hg, rg, *cgs], dim=-1) ================================================ FILE: mmdet3d/core/bbox/coders/partial_bin_based_bbox_coder.py ================================================ import numpy as np import torch from mmdet.core.bbox import BaseBBoxCoder from mmdet.core.bbox.builder import BBOX_CODERS @BBOX_CODERS.register_module() class 
@BBOX_CODERS.register_module()
class PartialBinBasedBBoxCoder(BaseBBoxCoder):
    """Box coder that classifies heading/size bins and regresses residuals.

    Args:
        num_dir_bins (int): Number of bins used to discretise the heading.
        num_sizes (int): Number of size clusters.
        mean_sizes (list[list[int]]): Mean box size of each size cluster;
            must contain exactly ``num_sizes`` entries.
        with_rot (bool): Whether boxes carry a rotation. Default: True.
    """

    def __init__(self, num_dir_bins, num_sizes, mean_sizes, with_rot=True):
        super(PartialBinBasedBBoxCoder, self).__init__()
        assert len(mean_sizes) == num_sizes
        self.num_dir_bins = num_dir_bins
        self.num_sizes = num_sizes
        self.mean_sizes = mean_sizes
        self.with_rot = with_rot

    def encode(self, gt_bboxes_3d, gt_labels_3d):
        """Turn ground-truth boxes into regression/classification targets.

        Args:
            gt_bboxes_3d (BaseInstance3DBoxes): Ground truth boxes.
            gt_labels_3d (torch.Tensor): Ground truth classes; also used as
                the size-cluster index of every box.

        Returns:
            tuple: (center_target, size_class_target, size_res_target,
                dir_class_target, dir_res_target).
        """
        center_target = gt_bboxes_3d.gravity_center

        # The size target is the residual w.r.t. the mean size of the class.
        boxes_tensor = gt_bboxes_3d.tensor
        class_mean_sizes = boxes_tensor.new_tensor(self.mean_sizes)
        size_res_target = gt_bboxes_3d.dims - class_mean_sizes[gt_labels_3d]

        num_boxes = gt_labels_3d.shape[0]
        if self.with_rot:
            dir_class_target, dir_res_target = self.angle2class(
                gt_bboxes_3d.yaw)
        else:
            # Without rotation every box falls in bin 0 with zero residual.
            dir_class_target = gt_labels_3d.new_zeros(num_boxes)
            dir_res_target = boxes_tensor.new_zeros(num_boxes)

        return (center_target, gt_labels_3d, size_res_target,
                dir_class_target, dir_res_target)

    def decode(self, bbox_out, suffix=''):
        """Assemble boxes from the split network predictions.

        Args:
            bbox_out (dict): Predictions; the keys ``center``,
                ``dir_class``, ``dir_res``, ``size_class`` and ``size_res``
                (each with ``suffix`` appended) are read.
            suffix (str): Suffix appended to every key. Default: ''.

        Returns:
            torch.Tensor: Boxes with shape (batch, n, 7), laid out as
                center (3), size (3), heading (1).
        """
        center = bbox_out['center' + suffix]
        batch_size, num_proposal = center.shape[:2]

        if self.with_rot:
            # Take the residual that belongs to the most likely heading bin.
            best_bin = torch.argmax(bbox_out['dir_class' + suffix], -1)
            best_res = torch.gather(bbox_out['dir_res' + suffix], 2,
                                    best_bin.unsqueeze(-1)).squeeze(2)
            heading = self.class2angle(best_bin, best_res).reshape(
                batch_size, num_proposal, 1)
        else:
            heading = center.new_zeros(batch_size, num_proposal, 1)

        # Same for sizes: pick the residual of the most likely size cluster.
        best_size = torch.argmax(
            bbox_out['size_class' + suffix], -1, keepdim=True)
        gather_index = best_size.unsqueeze(-1).repeat(1, 1, 1, 3)
        best_size_res = torch.gather(bbox_out['size_res' + suffix], 2,
                                     gather_index)
        cluster_means = center.new_tensor(self.mean_sizes)
        base_size = torch.index_select(cluster_means, 0,
                                       best_size.reshape(-1))
        box_size = base_size.reshape(batch_size, num_proposal,
                                     -1) + best_size_res.squeeze(2)

        return torch.cat([center, box_size, heading], dim=-1)

    def decode_corners(self, center, size_res, size_class):
        """Decode center, size residual and size class into min/max corners.

        The heading is ignored, so the result is only meaningful for
        axis-aligned boxes.

        Args:
            center (torch.Tensor): Shape [B, N, 3].
            size_res (torch.Tensor): Shape [B, N, 3] or [B, N, C, 3].
            size_class (torch.Tensor): Shape [B, N], [B, N, 1] or
                [B, N, C, 3].

        Returns:
            torch.Tensor: Corners with shape [B, N, 6].
        """
        index_given = (len(size_class.shape) == 2
                       or size_class.shape[-1] == 1)
        if index_given:
            # Class indices were passed: build a one-hot selector over the
            # size clusters and broadcast it to the three size channels.
            batch_size, proposal_num = size_class.shape[:2]
            one_hot = size_res.new_zeros(
                (batch_size, proposal_num, self.num_sizes))
            if len(size_class.shape) == 2:
                size_class = size_class.unsqueeze(-1)
            one_hot.scatter_(2, size_class, 1)
            selector = one_hot.unsqueeze(-1).repeat(1, 1, 1, 3).contiguous()
        else:
            # Already an expanded selector.
            selector = size_class

        if len(size_res.shape) == 4:
            size_res = torch.sum(size_res * selector, 2)

        cluster_means = size_res.new_tensor(self.mean_sizes)
        selected_mean = torch.sum(cluster_means * selector, 2)
        # Residuals are relative to the mean size; negative sizes are
        # clipped to zero.
        full_size = torch.clamp((size_res + 1) * selected_mean, 0)
        half_size = full_size / 2
        return torch.cat([center - half_size, center + half_size], dim=-1)

    def split_pred(self, cls_preds, reg_preds, base_xyz):
        """Slice the raw head outputs into named prediction parts.

        The regression channels are consumed in this order: center offset
        (3), heading class (num_dir_bins), heading residual (num_dir_bins),
        size class (num_sizes), size residual (num_sizes * 3). The first two
        classification channels are objectness, the rest semantic scores.

        Args:
            cls_preds (torch.Tensor): Classification features to split.
            reg_preds (torch.Tensor): Regression features to split.
            base_xyz (torch.Tensor): Coordinates of points.

        Returns:
            dict[str, torch.Tensor]: Split results.
        """
        # Move the channel axis last so that slices are per proposal.
        cls_t = cls_preds.transpose(2, 1)
        reg_t = reg_preds.transpose(2, 1)
        batch_size, num_proposal = reg_t.shape[:2]

        # Channel boundaries of the regression output.
        center_end = 3
        dir_cls_end = center_end + self.num_dir_bins
        dir_res_end = dir_cls_end + self.num_dir_bins
        size_cls_end = dir_res_end + self.num_sizes
        size_res_end = size_cls_end + self.num_sizes * 3

        results = {}
        results['center'] = base_xyz + reg_t[..., 0:center_end].contiguous()

        results['dir_class'] = reg_t[...,
                                     center_end:dir_cls_end].contiguous()
        dir_res_norm = reg_t[..., dir_cls_end:dir_res_end].contiguous()
        results['dir_res_norm'] = dir_res_norm
        # Normalised residuals are expressed in units of half a bin width.
        results['dir_res'] = dir_res_norm * (np.pi / self.num_dir_bins)

        results['size_class'] = reg_t[...,
                                      dir_res_end:size_cls_end].contiguous()
        size_res_norm = reg_t[..., size_cls_end:size_res_end].view(
            [batch_size, num_proposal, self.num_sizes, 3])
        results['size_res_norm'] = size_res_norm.contiguous()
        cluster_means = reg_preds.new_tensor(self.mean_sizes)
        results['size_res'] = (
            size_res_norm * cluster_means.unsqueeze(0).unsqueeze(0))

        results['obj_scores'] = cls_t[..., 0:2].contiguous()
        results['sem_scores'] = cls_t[..., 2:].contiguous()
        return results

    def angle2class(self, angle):
        """Convert a continuous angle into a bin index and a residual.

        Bin centres lie at 0, 1*(2pi/N), ..., (N-1)*(2pi/N) with
        N = ``num_dir_bins``; the residual is the offset from the centre of
        the selected bin.

        Args:
            angle (torch.Tensor): Angles; wrapped into [0, 2pi) first.

        Returns:
            tuple: Bin index (long tensor) and residual.
        """
        full_turn = 2 * np.pi
        bin_width = full_turn / float(self.num_dir_bins)
        # Shift by half a bin so that flooring picks the nearest centre.
        shifted = (angle % full_turn + bin_width / 2) % full_turn
        bin_index = shifted // bin_width
        residual = shifted - (bin_index * bin_width + bin_width / 2)
        return bin_index.long(), residual

    def class2angle(self, angle_cls, angle_res, limit_period=True):
        """Inverse of :meth:`angle2class`.

        Args:
            angle_cls (torch.Tensor): Bin indices to decode.
            angle_res (torch.Tensor): Residuals to decode.
            limit_period (bool): If True, angles above pi are shifted down
                by 2pi. Default: True.

        Returns:
            torch.Tensor: Angle decoded from ``angle_cls`` and ``angle_res``.
        """
        bin_width = 2 * np.pi / float(self.num_dir_bins)
        angle = angle_cls.float() * bin_width + angle_res
        if limit_period:
            angle = torch.where(angle > np.pi, angle - 2 * np.pi, angle)
        return angle
@BBOX_CODERS.register_module()
class TransFusionBBoxCoder(BaseBBoxCoder):
    """Box coder mapping between metric boxes and head regression targets.

    BEV centres are expressed in feature-map cells, sizes in log space and
    the heading as a (sin, cos) pair.

    Args:
        pc_range (list[float]): Point cloud range; elements 0 and 1 are used
            as the x/y origin of the feature map.
        out_size_factor (int): Down-sampling factor between voxel grid and
            feature map.
        voxel_size (list[float]): Voxel size; elements 0 and 1 are used.
        post_center_range (list[float], optional): (x_min, y_min, z_min,
            x_max, y_max, z_max) limits applied to decoded centres when
            ``decode`` is called with ``filter=True``. Default: None.
        score_threshold (float, optional): Minimum score kept when
            filtering. Default: None.
        code_size (int): Number of target channels; 10 adds two velocity
            channels. Default: 8.
    """

    def __init__(self,
                 pc_range,
                 out_size_factor,
                 voxel_size,
                 post_center_range=None,
                 score_threshold=None,
                 code_size=8):
        super(TransFusionBBoxCoder, self).__init__()
        self.pc_range = pc_range
        self.out_size_factor = out_size_factor
        self.voxel_size = voxel_size
        self.post_center_range = post_center_range
        self.score_threshold = score_threshold
        self.code_size = code_size

    def encode(self, dst_boxes):
        """Encode boxes into regression targets.

        Args:
            dst_boxes (torch.Tensor): Boxes with shape (N, 7) or (N, 9),
                columns [x, y, z, dx, dy, dz, yaw, (vx, vy)].

        Returns:
            torch.Tensor: Targets with shape (N, code_size), columns
                [x_cell, y_cell, z_center, log dx, log dy, log dz,
                sin(yaw), cos(yaw), (vx, vy)].
        """
        targets = torch.zeros([dst_boxes.shape[0], self.code_size],
                              device=dst_boxes.device)
        targets[:, 0] = (dst_boxes[:, 0] - self.pc_range[0]) / (
            self.out_size_factor * self.voxel_size[0])
        targets[:, 1] = (dst_boxes[:, 1] - self.pc_range[1]) / (
            self.out_size_factor * self.voxel_size[1])
        # bottom center to gravity center
        targets[:, 2] = dst_boxes[:, 2] + dst_boxes[:, 5] * 0.5
        targets[:, 3:6] = dst_boxes[:, 3:6].log()
        targets[:, 6] = torch.sin(dst_boxes[:, 6])
        targets[:, 7] = torch.cos(dst_boxes[:, 6])
        if self.code_size == 10:
            targets[:, 8:10] = dst_boxes[:, 7:]
        return targets

    def decode(self, heatmap, rot, dim, center, height, vel, filter=False):
        """Decode head outputs into boxes, scores and labels.

        None of the input tensors is modified, and the coder's own
        configuration is left untouched.

        Args:
            heatmap (torch.Tensor): Class scores with the shape of
                [B, num_cls, num_proposals].
            rot (torch.Tensor): Heading as (sin, cos) with the shape of
                [B, 2, num_proposals].
            dim (torch.Tensor): Log sizes with the shape of
                [B, 3, num_proposals].
            center (torch.Tensor): BEV centres in feature-map cells with the
                shape of [B, 2, num_proposals].
            height (torch.Tensor): Gravity-centre z in metric units with the
                shape of [B, 1, num_proposals].
            vel (torch.Tensor | None): Velocity with the shape of
                [B, 2, num_proposals], or None to omit it.
            filter (bool): If exactly False, return every box without
                checking score and centre range. Default: False.

        Returns:
            list[dict]: One dict per sample with keys ``bboxes``, ``scores``
                and ``labels``; when filtering, also ``cmask`` (the boolean
                keep-mask over proposals).

        Raises:
            NotImplementedError: If filtering is requested while
                ``post_center_range`` is None.
        """
        # Class label and score of every proposal in one reduction.
        final_scores, final_preds = heatmap.max(1)

        # Feature-map cells -> metric coordinates. Cloned so that the
        # caller's prediction tensor is not overwritten.
        center = center.clone()
        center[:, 0, :] = center[:, 0, :] * self.out_size_factor * \
            self.voxel_size[0] + self.pc_range[0]
        center[:, 1, :] = center[:, 1, :] * self.out_size_factor * \
            self.voxel_size[1] + self.pc_range[1]

        # Log sizes -> metric sizes, again on a private copy.
        dim = dim.clone()
        dim[:, 0:3, :] = dim[:, 0:3, :].exp()

        # gravity center to bottom center
        height = height - dim[:, 2:3, :] * 0.5
        rot = torch.atan2(rot[:, 0:1, :], rot[:, 1:2, :])

        parts = [center, height, dim, rot]
        if vel is not None:
            parts.append(vel)
        final_box_preds = torch.cat(parts, dim=1).permute(0, 2, 1)

        batch_size = heatmap.shape[0]
        if filter is False:
            return [{
                'bboxes': final_box_preds[i],
                'scores': final_scores[i],
                'labels': final_preds[i]
            } for i in range(batch_size)]

        if self.post_center_range is None:
            raise NotImplementedError(
                'Need to reorganize output as a batch, only '
                'support post_center_range is not None for now!')

        # Build the range tensor locally; storing it back on ``self`` would
        # pin the coder to the device of the first batch it ever decoded.
        center_range = final_box_preds.new_tensor(self.post_center_range)
        keep = (final_box_preds[..., :3] >= center_range[:3]).all(2)
        keep &= (final_box_preds[..., :3] <= center_range[3:]).all(2)
        # ``is not None`` so that an explicit threshold of 0.0 is honoured.
        if self.score_threshold is not None:
            keep &= final_scores > self.score_threshold

        predictions_dicts = []
        for i in range(batch_size):
            cmask = keep[i, :]
            predictions_dicts.append({
                'bboxes': final_box_preds[i, cmask],
                'scores': final_scores[i, cmask],
                'labels': final_preds[i, cmask],
                'cmask': cmask
            })
        return predictions_dicts
@IOU_CALCULATORS.register_module()
class BboxOverlapsNearest3D(object):
    """Nearest 3D IoU Calculator.

    Note:
        This IoU calculator first finds the nearest 2D boxes in bird eye view
        (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`.

    Args:
        coordinate (str): 'camera', 'lidar', or 'depth' coordinate system.
    """

    def __init__(self, coordinate='lidar'):
        assert coordinate in ['camera', 'lidar', 'depth']
        self.coordinate = coordinate

    def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False):
        """Calculate nearest 3D IoU.

        Note:
            If ``is_aligned`` is ``False``, then it calculates the ious
            between each bbox of bboxes1 and bboxes2, otherwise it calculates
            the ious between each aligned pair of bboxes1 and bboxes2.

        Args:
            bboxes1 (torch.Tensor): shape (N, 7+C) [x, y, z, h, w, l, ry, v].
            bboxes2 (torch.Tensor): shape (M, 7+C) [x, y, z, h, w, l, ry, v].
            mode (str): "iou" (intersection over union) or iof
                (intersection over foreground).
            is_aligned (bool): Whether the calculation is aligned.

        Return:
            torch.Tensor: If ``is_aligned`` is ``False``, pairwise ious with
                shape (N, M); if ``is_aligned`` is ``True`` (which requires
                N == M), one iou per pair with shape (N,).
        """
        return bbox_overlaps_nearest_3d(bboxes1, bboxes2, mode, is_aligned,
                                        self.coordinate)

    def __repr__(self):
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        # The closing parenthesis was missing, giving an unbalanced repr.
        repr_str += f'(coordinate={self.coordinate})'
        return repr_str
@IOU_CALCULATORS.register_module()
class BboxOverlaps3D(object):
    """3D IoU Calculator.

    Args:
        coordinate (str): The coordinate system, valid options are
            'camera', 'lidar', and 'depth'.
    """

    def __init__(self, coordinate):
        assert coordinate in ['camera', 'lidar', 'depth']
        self.coordinate = coordinate

    def __call__(self, bboxes1, bboxes2, mode='iou'):
        """Calculate 3D IoU using cuda implementation.

        Note:
            The IoU is computed from the volumes of the 3D boxes; the work is
            delegated to :func:`bbox_overlaps_3d`.

        Args:
            bboxes1 (torch.Tensor): shape (N, 7+C) [x, y, z, h, w, l, ry].
            bboxes2 (torch.Tensor): shape (M, 7+C) [x, y, z, h, w, l, ry].
            mode (str): "iou" (intersection over union) or
                iof (intersection over foreground).

        Return:
            torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2
                (aligned mode is not supported currently).
        """
        return bbox_overlaps_3d(bboxes1, bboxes2, mode, self.coordinate)

    def __repr__(self):
        """str: return a string that describes the module"""
        repr_str = self.__class__.__name__
        # The closing parenthesis was missing, giving an unbalanced repr.
        repr_str += f'(coordinate={self.coordinate})'
        return repr_str
def bbox_overlaps_nearest_3d(bboxes1,
                             bboxes2,
                             mode='iou',
                             is_aligned=False,
                             coordinate='lidar'):
    """Calculate nearest 3D IoU.

    Note:
        Both box sets are wrapped in the box type of ``coordinate``, reduced
        to their nearest axis-aligned BEV boxes (``nearest_bev``) and then
        compared with the 2D :meth:`bbox_overlaps`. The IoU calculator
        :class:`BboxOverlapsNearest3D` uses this function.

    Args:
        bboxes1 (torch.Tensor): shape (N, 7+C) [x, y, z, h, w, l, ry, v].
        bboxes2 (torch.Tensor): shape (M, 7+C) [x, y, z, h, w, l, ry, v].
        mode (str): "iou" (intersection over union) or iof
            (intersection over foreground).
        is_aligned (bool): If ``False``, every box of ``bboxes1`` is compared
            with every box of ``bboxes2``; if ``True``, only the aligned
            pairs are compared.
        coordinate (str): Coordinate system passed to ``get_box_type``.
            Default: 'lidar'.

    Return:
        torch.Tensor: The result of ``bbox_overlaps`` on the BEV boxes.
    """
    assert (bboxes1.size(-1) == bboxes2.size(-1)
            and bboxes2.size(-1) >= 7)

    box_cls, _ = get_box_type(coordinate)
    boxes_a = box_cls(bboxes1, box_dim=bboxes1.shape[-1])
    boxes_b = box_cls(bboxes2, box_dim=bboxes2.shape[-1])

    # Box conversion and IoU calculation in torch on CUDA is 10x faster
    # than the numpy version.
    return bbox_overlaps(
        boxes_a.nearest_bev,
        boxes_b.nearest_bev,
        mode=mode,
        is_aligned=is_aligned)


def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou', coordinate='camera'):
    """Calculate 3D IoU using cuda implementation.

    Note:
        The IoU is based on the volumes of the 3D boxes and is computed by
        the ``overlaps`` method of the box type selected by ``coordinate``.
        The IoU calculator :class:`BboxOverlaps3D` uses this function.

    Args:
        bboxes1 (torch.Tensor): shape (N, 7+C) [x, y, z, h, w, l, ry].
        bboxes2 (torch.Tensor): shape (M, 7+C) [x, y, z, h, w, l, ry].
        mode (str): "iou" (intersection over union) or
            iof (intersection over foreground).
        coordinate (str): Coordinate system passed to ``get_box_type``.
            Default: 'camera'.

    Return:
        torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2
            (aligned mode is not supported currently).
    """
    assert (bboxes1.size(-1) == bboxes2.size(-1)
            and bboxes2.size(-1) >= 7)

    box_cls, _ = get_box_type(coordinate)
    boxes_a = box_cls(bboxes1, box_dim=bboxes1.shape[-1])
    boxes_b = box_cls(bboxes2, box_dim=bboxes2.shape[-1])

    return boxes_a.overlaps(boxes_a, boxes_b, mode=mode)
""" assert bboxes1.size(-1) == bboxes2.size(-1) >= 7 box_type, _ = get_box_type(coordinate) bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1]) bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1]) return bboxes1.overlaps(bboxes1, bboxes2, mode=mode) @IOU_CALCULATORS.register_module() class AxisAlignedBboxOverlaps3D(object): """Axis-aligned 3D Overlaps (IoU) Calculator.""" def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): """Calculate IoU between 2D bboxes. Args: bboxes1 (Tensor): shape (B, m, 6) in format or empty. bboxes2 (Tensor): shape (B, n, 6) in format or empty. B indicates the batch dim, in shape (B1, B2, ..., Bn). If ``is_aligned `` is ``True``, then m and n must be equal. mode (str): "iou" (intersection over union) or "giou" (generalized intersection over union). is_aligned (bool, optional): If True, then m and n must be equal. Default False. Returns: Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,) """ assert bboxes1.size(-1) == bboxes2.size(-1) == 6 return axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2, mode, is_aligned) def __repr__(self): """str: a string describing the module""" repr_str = self.__class__.__name__ + '()' return repr_str def axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6): """Calculate overlap between two set of axis aligned 3D bboxes. If ``is_aligned `` is ``False``, then calculate the overlaps between each bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned pair of bboxes1 and bboxes2. Args: bboxes1 (Tensor): shape (B, m, 6) in format or empty. bboxes2 (Tensor): shape (B, n, 6) in format or empty. B indicates the batch dim, in shape (B1, B2, ..., Bn). If ``is_aligned `` is ``True``, then m and n must be equal. mode (str): "iou" (intersection over union) or "giou" (generalized intersection over union). is_aligned (bool, optional): If True, then m and n must be equal. Default False. 
def axis_aligned_bbox_overlaps_3d(bboxes1,
                                  bboxes2,
                                  mode='iou',
                                  is_aligned=False,
                                  eps=1e-6):
    """Calculate overlap between two sets of axis aligned 3D bboxes.

    If ``is_aligned`` is ``False``, then calculate the overlaps between each
    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
    pair of bboxes1 and bboxes2.

    Args:
        bboxes1 (Tensor): shape (B, m, 6), each box given as min corner
            (columns 0-2) followed by max corner (columns 3-5), or empty.
        bboxes2 (Tensor): shape (B, n, 6), same layout, or empty.
            B indicates the batch dim, in shape (B1, B2, ..., Bn).
            If ``is_aligned`` is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union) or "giou" (generalized
            intersection over union).
        is_aligned (bool, optional): If True, then m and n must be equal.
            Default False.
        eps (float, optional): Lower bound applied to the union and to the
            enclosing volume for numerical stability. Default 1e-6.

    Returns:
        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)

    Example:
        >>> bboxes1 = torch.FloatTensor([
        >>>     [0, 0, 0, 10, 10, 10],
        >>>     [10, 10, 10, 20, 20, 20],
        >>>     [32, 32, 32, 38, 40, 42],
        >>> ])
        >>> bboxes2 = torch.FloatTensor([
        >>>     [0, 0, 0, 10, 20, 20],
        >>>     [0, 10, 10, 10, 19, 20],
        >>>     [10, 10, 10, 20, 20, 20],
        >>> ])
        >>> overlaps = axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2)
        >>> assert overlaps.shape == (3, 3)
        >>> overlaps = axis_aligned_bbox_overlaps_3d(
        >>>     bboxes1, bboxes2, is_aligned=True)
        >>> assert overlaps.shape == (3, )

    Example:
        >>> empty = torch.empty(0, 6)
        >>> nonempty = torch.FloatTensor([[0, 0, 0, 10, 9, 10]])
        >>> f = axis_aligned_bbox_overlaps_3d
        >>> assert tuple(f(empty, nonempty).shape) == (0, 1)
        >>> assert tuple(f(nonempty, empty).shape) == (1, 0)
        >>> assert tuple(f(empty, empty).shape) == (0, 0)
    """
    assert mode in ['iou', 'giou'], f'Unsupported mode {mode}'
    # Either the boxes are empty or the length of the last dimension is 6
    assert (bboxes1.size(-1) == 6 or bboxes1.size(0) == 0)
    assert (bboxes2.size(-1) == 6 or bboxes2.size(0) == 0)

    # Batch dims (B1, B2, ... Bn) must be the same
    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
    batch_shape = bboxes1.shape[:-2]

    rows = bboxes1.size(-2)
    cols = bboxes2.size(-2)
    if is_aligned:
        assert rows == cols

    if rows * cols == 0:
        # Nothing to compare: return an (empty) tensor of the right shape.
        out_shape = batch_shape + ((rows, ) if is_aligned else (rows, cols))
        return bboxes1.new_zeros(out_shape)

    area1 = (bboxes1[..., 3] -
             bboxes1[..., 0]) * (bboxes1[..., 4] - bboxes1[..., 1]) * (
                 bboxes1[..., 5] - bboxes1[..., 2])
    area2 = (bboxes2[..., 3] -
             bboxes2[..., 0]) * (bboxes2[..., 4] - bboxes2[..., 1]) * (
                 bboxes2[..., 5] - bboxes2[..., 2])

    if not is_aligned:
        # Insert singleton axes so that every box of bboxes1 broadcasts
        # against every box of bboxes2: [B, rows, 1, 6] vs [B, 1, cols, 6].
        bboxes1 = bboxes1[..., :, None, :]
        bboxes2 = bboxes2[..., None, :, :]
        area1 = area1[..., :, None]
        area2 = area2[..., None, :]

    # Intersection box; non-overlapping pairs clamp to zero extent.
    lt = torch.max(bboxes1[..., :3], bboxes2[..., :3])
    rb = torch.min(bboxes1[..., 3:], bboxes2[..., 3:])
    whd = (rb - lt).clamp(min=0)
    overlap = whd[..., 0] * whd[..., 1] * whd[..., 2]

    eps = area1.new_tensor([eps])
    union = torch.max(area1 + area2 - overlap, eps)
    ious = overlap / union
    if mode == 'iou':
        return ious

    # GIoU: penalise by the part of the smallest enclosing box that is not
    # covered by the union.
    enclosed_lt = torch.min(bboxes1[..., :3], bboxes2[..., :3])
    enclosed_rb = torch.max(bboxes1[..., 3:], bboxes2[..., 3:])
    enclose_whd = (enclosed_rb - enclosed_lt).clamp(min=0)
    enclose_area = enclose_whd[..., 0] * enclose_whd[..., 1] * \
        enclose_whd[..., 2]
    enclose_area = torch.max(enclose_area, eps)
    gious = ious - (enclose_area - union) / enclose_area
    return gious
@BBOX_SAMPLERS.register_module()
class IoUNegPiecewiseSampler(RandomSampler):
    """IoU Piece-wise Sampling.

    Negative proposals are split into pieces by the IoU thresholds in
    `neg_iou_piece_thrs`, and each piece contributes the share of samples
    given by `neg_piece_fractions`.

    Args:
        num (int): Number of proposals.
        pos_fraction (float): The fraction of positive proposals.
        neg_piece_fractions (list): Fraction of the negative samples that
            each piece should contribute.
        neg_iou_piece_thrs (list): IoU thresholds; entry ``i`` is the
            (exclusive) upper bound of piece ``i``.
        neg_pos_ub (float): Upper bound on the ratio of negative to positive
            samples; negative values disable the bound.
        add_gt_as_proposals (bool): Whether to add gt as proposals.
        return_iou (bool): Whether to attach the max overlaps of the sampled
            boxes to the sampling result.
    """

    def __init__(self,
                 num,
                 pos_fraction=None,
                 neg_piece_fractions=None,
                 neg_iou_piece_thrs=None,
                 neg_pos_ub=-1,
                 add_gt_as_proposals=False,
                 return_iou=False):
        super(IoUNegPiecewiseSampler,
              self).__init__(num, pos_fraction, neg_pos_ub,
                             add_gt_as_proposals)
        assert isinstance(neg_piece_fractions, list)
        assert len(neg_piece_fractions) == len(neg_iou_piece_thrs)
        self.neg_piece_fractions = neg_piece_fractions
        self.neg_iou_thr = neg_iou_piece_thrs
        self.return_iou = return_iou
        self.neg_piece_num = len(self.neg_piece_fractions)

    def _sample_pos(self, assign_result, num_expected, **kwargs):
        """Randomly sample some positive samples."""
        candidates = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False)
        if candidates.numel() != 0:
            candidates = candidates.squeeze(1)
        if candidates.numel() > num_expected:
            return self.random_choice(candidates, num_expected)
        return candidates

    def _sample_neg(self, assign_result, num_expected, **kwargs):
        """Randomly sample some negative samples, piece by piece."""
        candidates = torch.nonzero(
            assign_result.gt_inds == 0, as_tuple=False)
        if candidates.numel() != 0:
            candidates = candidates.squeeze(1)
        if len(candidates) <= num_expected:
            # Not enough negatives to choose from: keep all of them.
            return candidates

        picked = candidates.new_zeros([0])
        carry_over = 0
        overlaps = assign_result.max_overlaps[candidates]
        last_piece = self.neg_piece_num - 1
        for piece in range(self.neg_piece_num):
            if piece == last_piece:
                # The last piece fills whatever is still missing and
                # reaches down to IoU 0.
                quota = num_expected - len(picked)
                lower_thr = 0
            else:
                # If earlier pieces could not fill their quota, the
                # shortfall is added to the quota of the current piece.
                quota = int(num_expected *
                            self.neg_piece_fractions[piece]) + carry_over
                lower_thr = self.neg_iou_thr[piece + 1]
            upper_thr = self.neg_iou_thr[piece]

            # Positions (within ``candidates``) that fall into this piece.
            in_piece = torch.nonzero(
                (overlaps >= lower_thr) & (overlaps < upper_thr),
                as_tuple=False).view(-1)

            if len(in_piece) < quota:
                picked = torch.cat([picked, candidates[in_piece]], dim=0)
                carry_over += quota - len(in_piece)
            else:
                chosen = self.random_choice(in_piece, quota)
                picked = torch.cat([picked, candidates[chosen]], dim=0)
                carry_over = 0
        return picked

    def sample(self,
               assign_result,
               bboxes,
               gt_bboxes,
               gt_labels=None,
               **kwargs):
        """Sample positive and negative bboxes.

        Args:
            assign_result (:obj:`AssignResult`): Bbox assigning results.
            bboxes (torch.Tensor): Boxes to be sampled from.
            gt_bboxes (torch.Tensor): Ground truth bboxes.
            gt_labels (torch.Tensor, optional): Class labels of ground truth
                bboxes; required when gt boxes are added as proposals.

        Returns:
            :obj:`SamplingResult`: Sampling result; carries an ``iou``
                attribute when ``return_iou`` is set.
        """
        if len(bboxes.shape) < 2:
            bboxes = bboxes[None, :]

        gt_flags = bboxes.new_zeros((bboxes.shape[0], ), dtype=torch.bool)
        if self.add_gt_as_proposals and len(gt_bboxes) > 0:
            if gt_labels is None:
                raise ValueError(
                    'gt_labels must be given when add_gt_as_proposals is True')
            # Ground truth boxes are prepended and flagged as such.
            bboxes = torch.cat([gt_bboxes, bboxes], dim=0)
            assign_result.add_gt_(gt_labels)
            gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.bool)
            gt_flags = torch.cat([gt_ones, gt_flags])

        num_expected_pos = int(self.num * self.pos_fraction)
        pos_inds = self.pos_sampler._sample_pos(
            assign_result, num_expected_pos, bboxes=bboxes, **kwargs)
        # We found that sampled indices have duplicated items occasionally.
        # (may be a bug of PyTorch)
        pos_inds = pos_inds.unique()
        num_sampled_pos = pos_inds.numel()

        num_expected_neg = self.num - num_sampled_pos
        if self.neg_pos_ub >= 0:
            neg_upper_bound = int(self.neg_pos_ub * max(1, num_sampled_pos))
            num_expected_neg = min(num_expected_neg, neg_upper_bound)
        neg_inds = self.neg_sampler._sample_neg(
            assign_result, num_expected_neg, bboxes=bboxes, **kwargs)
        neg_inds = neg_inds.unique()

        sampling_result = SamplingResult(pos_inds, neg_inds, bboxes,
                                         gt_bboxes, assign_result, gt_flags)
        if self.return_iou:
            # PartA2 needs iou score to regression.
            sampled = torch.cat([pos_inds, neg_inds])
            sampling_result.iou = assign_result.max_overlaps[sampled]
            sampling_result.iou.detach_()

        return sampling_result
box_dim (int): Number of the dimension of a box. Each row is (x, y, z, x_size, y_size, z_size, yaw). Default to 7. with_yaw (bool): Whether the box is with yaw rotation. If False, the value of yaw will be set to 0 as minmax boxes. Default to True. origin (tuple[float]): The relative position of origin in the box. Default to (0.5, 0.5, 0). This will guide the box be converted to (0.5, 0.5, 0) mode. Attributes: tensor (torch.Tensor): Float matrix of N x box_dim. box_dim (int): Integer indicating the dimension of a box. Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). with_yaw (bool): If True, the value of yaw will be set to 0 as minmax boxes. """ def __init__(self, tensor, box_dim=7, with_yaw=True, origin=(0.5, 0.5, 0)): if isinstance(tensor, torch.Tensor): device = tensor.device else: device = torch.device('cpu') tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) if tensor.numel() == 0: # Use reshape, so we don't end up creating a new tensor that # does not depend on the inputs (and consequently confuses jit) tensor = tensor.reshape((0, box_dim)).to( dtype=torch.float32, device=device) assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size() if tensor.shape[-1] == 6: # If the dimension of boxes is 6, we expand box_dim by padding # 0 as a fake yaw and set with_yaw to False. 
assert box_dim == 6 fake_rot = tensor.new_zeros(tensor.shape[0], 1) tensor = torch.cat((tensor, fake_rot), dim=-1) self.box_dim = box_dim + 1 self.with_yaw = False else: self.box_dim = box_dim self.with_yaw = with_yaw self.tensor = tensor.clone() if origin != (0.5, 0.5, 0): dst = self.tensor.new_tensor((0.5, 0.5, 0)) src = self.tensor.new_tensor(origin) self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src) @property def volume(self): """torch.Tensor: A vector with volume of each box.""" return self.tensor[:, 3] * self.tensor[:, 4] * self.tensor[:, 5] @property def dims(self): """torch.Tensor: Corners of each box with size (N, 8, 3).""" return self.tensor[:, 3:6] @property def yaw(self): """torch.Tensor: A vector with yaw of each box.""" return self.tensor[:, 6] @property def height(self): """torch.Tensor: A vector with height of each box.""" return self.tensor[:, 5] @property def top_height(self): """torch.Tensor: A vector with the top height of each box.""" return self.bottom_height + self.height @property def bottom_height(self): """torch.Tensor: A vector with bottom's height of each box.""" return self.tensor[:, 2] @property def center(self): """Calculate the center of all the boxes. Note: In the MMDetection3D's convention, the bottom center is usually taken as the default center. The relative position of the centers in different kinds of boxes are different, e.g., the relative center of a boxes is (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar. It is recommended to use ``bottom_center`` or ``gravity_center`` for more clear usage. Returns: torch.Tensor: A tensor with center of each box. 
""" return self.bottom_center @property def bottom_center(self): """torch.Tensor: A tensor with center of each box.""" return self.tensor[:, :3] @property def gravity_center(self): """torch.Tensor: A tensor with center of each box.""" pass @property def corners(self): """torch.Tensor: a tensor with 8 corners of each box.""" pass @abstractmethod def rotate(self, angles, axis=0): """Calculate whether the points are in any of the boxes. Args: angles (float): Rotation angles. axis (int): The axis to rotate the boxes. """ pass @abstractmethod def flip(self, bev_direction='horizontal'): """Flip the boxes in BEV along given BEV direction.""" pass def translate(self, trans_vector): """Calculate whether the points are in any of the boxes. Args: trans_vector (torch.Tensor): Translation vector of size 1x3. """ if not isinstance(trans_vector, torch.Tensor): trans_vector = self.tensor.new_tensor(trans_vector) self.tensor[:, :3] += trans_vector def in_range_3d(self, box_range): """Check whether the boxes are in the given range. Args: box_range (list | torch.Tensor): The range of box (x_min, y_min, z_min, x_max, y_max, z_max) Note: In the original implementation of SECOND, checking whether a box in the range checks whether the points are in a convex polygon, we try to reduce the burden for simpler cases. Returns: torch.Tensor: A binary vector indicating whether each box is \ inside the reference range. """ in_range_flags = ((self.tensor[:, 0] > box_range[0]) & (self.tensor[:, 1] > box_range[1]) & (self.tensor[:, 2] > box_range[2]) & (self.tensor[:, 0] < box_range[3]) & (self.tensor[:, 1] < box_range[4]) & (self.tensor[:, 2] < box_range[5])) return in_range_flags @abstractmethod def in_range_bev(self, box_range): """Check whether the boxes are in the given range. Args: box_range (list | torch.Tensor): The range of box in order of (x_min, y_min, x_max, y_max). Returns: torch.Tensor: Indicating whether each box is inside \ the reference range. 
""" pass @abstractmethod def convert_to(self, dst, rt_mat=None): """Convert self to ``dst`` mode. Args: dst (:obj:`BoxMode`): The target Box mode. rt_mat (np.ndarray | torch.Tensor): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from `src` coordinates to `dst` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: :obj:`BaseInstance3DBoxes`: The converted box of the same type \ in the `dst` mode. """ pass def scale(self, scale_factor): """Scale the box with horizontal and vertical scaling factors. Args: scale_factors (float): Scale factors to scale the boxes. """ self.tensor[:, :6] *= scale_factor self.tensor[:, 7:] *= scale_factor def limit_yaw(self, offset=0.5, period=np.pi): """Limit the yaw to a given period and offset. Args: offset (float): The offset of the yaw. period (float): The expected period. """ self.tensor[:, 6] = limit_period(self.tensor[:, 6], offset, period) def nonempty(self, threshold: float = 0.0): """Find boxes that are non-empty. A box is considered empty, if either of its side is no larger than threshold. Args: threshold (float): The threshold of minimal sizes. Returns: torch.Tensor: A binary vector which represents whether each \ box is empty (False) or non-empty (True). """ box = self.tensor size_x = box[..., 3] size_y = box[..., 4] size_z = box[..., 5] keep = ((size_x > threshold) & (size_y > threshold) & (size_z > threshold)) return keep def __getitem__(self, item): """ Note: The following usage are allowed: 1. `new_boxes = boxes[3]`: return a `Boxes` that contains only one box. 2. `new_boxes = boxes[2:10]`: return a slice of boxes. 3. `new_boxes = boxes[vector]`: where vector is a torch.BoolTensor with `length = len(boxes)`. Nonzero elements in the vector will be selected. Note that the returned Boxes might share storage with this Boxes, subject to Pytorch's indexing semantics. 
    def __len__(self):
        """int: Number of boxes in the current object."""
        return self.tensor.shape[0]

    def __repr__(self):
        """str: Return a strings that describes the object."""
        return self.__class__.__name__ + '(\n    ' + str(self.tensor) + ')'

    @classmethod
    def cat(cls, boxes_list):
        """Concatenate a list of Boxes into a single Boxes.

        ``box_dim`` and ``with_yaw`` of the result are taken from the first
        element of the list. An empty list yields ``cls(torch.empty(0))``.

        Args:
            boxes_list (list[:obj:`BaseInstances3DBoxes`]): List of boxes.

        Returns:
            :obj:`BaseInstances3DBoxes`: The concatenated Boxes.
        """
        assert isinstance(boxes_list, (list, tuple))
        if len(boxes_list) == 0:
            return cls(torch.empty(0))
        assert all(isinstance(box, cls) for box in boxes_list)

        # use torch.cat (v.s. layers.cat)
        # so the returned boxes never share storage with input
        cat_boxes = cls(
            torch.cat([b.tensor for b in boxes_list], dim=0),
            box_dim=boxes_list[0].tensor.shape[1],
            with_yaw=boxes_list[0].with_yaw)
        return cat_boxes

    def to(self, device):
        """Convert current boxes to a specific device.

        Args:
            device (str | :obj:`torch.device`): The name of the device.

        Returns:
            :obj:`BaseInstance3DBoxes`: A new boxes object on the \
                specific device.
        """
        original_type = type(self)
        return original_type(
            self.tensor.to(device),
            box_dim=self.box_dim,
            with_yaw=self.with_yaw)

    def clone(self):
        """Clone the Boxes.

        Returns:
            :obj:`BaseInstance3DBoxes`: Box object with the same properties \
                as self.
        """
        original_type = type(self)
        return original_type(
            self.tensor.clone(), box_dim=self.box_dim, with_yaw=self.with_yaw)
    @property
    def device(self):
        """str: The device of the boxes are on."""
        return self.tensor.device

    def __iter__(self):
        """Yield a box as a Tensor at a time.

        Returns:
            torch.Tensor: One row of ``self.tensor`` per iteration.
        """
        yield from self.tensor

    @classmethod
    def height_overlaps(cls, boxes1, boxes2, mode='iou'):
        """Calculate height overlaps of two boxes.

        Note:
            This function calculates the height overlaps between boxes1 and
            boxes2,  boxes1 and boxes2 should be in the same type.

        Args:
            boxes1 (:obj:`BaseInstanceBoxes`): Boxes 1 contain N boxes.
            boxes2 (:obj:`BaseInstanceBoxes`): Boxes 2 contain M boxes.
            mode (str, optional): Mode of iou calculation. Defaults to 'iou'.
                This argument is not used by the function body.

        Returns:
            torch.Tensor: Overlapped height of every pair of boxes, in \
                shape (N, M). Pairs without overlap get 0.
        """
        assert isinstance(boxes1, BaseInstance3DBoxes)
        assert isinstance(boxes2, BaseInstance3DBoxes)
        assert type(boxes1) == type(boxes2), '"boxes1" and "boxes2" should' \
            f'be in the same type, got {type(boxes1)} and {type(boxes2)}.'

        # column vs. row views so that max/min broadcast to (N, M)
        boxes1_top_height = boxes1.top_height.view(-1, 1)
        boxes1_bottom_height = boxes1.bottom_height.view(-1, 1)
        boxes2_top_height = boxes2.top_height.view(1, -1)
        boxes2_bottom_height = boxes2.bottom_height.view(1, -1)

        heighest_of_bottom = torch.max(boxes1_bottom_height,
                                       boxes2_bottom_height)
        lowest_of_top = torch.min(boxes1_top_height, boxes2_top_height)
        overlaps_h = torch.clamp(lowest_of_top - heighest_of_bottom, min=0)
        return overlaps_h

    @classmethod
    def overlaps(cls, boxes1, boxes2, mode='iou'):
        """Calculate 3D overlaps of two boxes.

        Note:
            This function calculates the overlaps between ``boxes1`` and
            ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type.

            The BEV overlap is computed on CUDA: the BEV boxes are moved
            with ``.cuda()`` and the result is moved back to
            ``boxes1.device``.

        Args:
            boxes1 (:obj:`BaseInstanceBoxes`): Boxes 1 contain N boxes.
            boxes2 (:obj:`BaseInstanceBoxes`): Boxes 2 contain M boxes.
            mode (str, optional): Mode of iou calculation. Defaults to 'iou'.
                With 'iof' the overlap is divided by the volume of
                ``boxes1`` only.

        Returns:
            torch.Tensor: Calculated 3D iou (or iof) of boxes, in shape \
                (N, M).
        """
        assert isinstance(boxes1, BaseInstance3DBoxes)
        assert isinstance(boxes2, BaseInstance3DBoxes)
        assert type(boxes1) == type(boxes2), '"boxes1" and "boxes2" should' \
            f'be in the same type, got {type(boxes1)} and {type(boxes2)}.'

        assert mode in ['iou', 'iof']

        rows = len(boxes1)
        cols = len(boxes2)
        # one side is empty: return an empty (rows, cols) tensor right away
        if rows * cols == 0:
            return boxes1.tensor.new(rows, cols)

        # height overlap
        overlaps_h = cls.height_overlaps(boxes1, boxes2)

        # obtain BEV boxes in XYXYR format
        boxes1_bev = xywhr2xyxyr(boxes1.bev)
        boxes2_bev = xywhr2xyxyr(boxes2.bev)

        # bev overlap
        overlaps_bev = boxes1_bev.new_zeros(
            (boxes1_bev.shape[0], boxes2_bev.shape[0])).cuda()  # (N, M)
        iou3d_cuda.boxes_overlap_bev_gpu(boxes1_bev.contiguous().cuda(),
                                         boxes2_bev.contiguous().cuda(),
                                         overlaps_bev)

        # 3d overlaps
        overlaps_3d = overlaps_bev.to(boxes1.device) * overlaps_h

        volume1 = boxes1.volume.view(-1, 1)
        volume2 = boxes2.volume.view(1, -1)

        if mode == 'iou':
            # the clamp func is used to avoid division of 0
            iou3d = overlaps_3d / torch.clamp(
                volume1 + volume2 - overlaps_3d, min=1e-8)
        else:
            iou3d = overlaps_3d / torch.clamp(volume1, min=1e-8)

        return iou3d
""" assert isinstance(boxes1, BaseInstance3DBoxes) assert isinstance(boxes2, BaseInstance3DBoxes) assert type(boxes1) == type(boxes2), '"boxes1" and "boxes2" should' \ f'be in the same type, got {type(boxes1)} and {type(boxes2)}.' assert mode in ['iou', 'iof'] rows = len(boxes1) cols = len(boxes2) if rows * cols == 0: return boxes1.tensor.new(rows, cols) # height overlap overlaps_h = cls.height_overlaps(boxes1, boxes2) # obtain BEV boxes in XYXYR format boxes1_bev = xywhr2xyxyr(boxes1.bev) boxes2_bev = xywhr2xyxyr(boxes2.bev) # bev overlap overlaps_bev = boxes1_bev.new_zeros( (boxes1_bev.shape[0], boxes2_bev.shape[0])).cuda() # (N, M) iou3d_cuda.boxes_overlap_bev_gpu(boxes1_bev.contiguous().cuda(), boxes2_bev.contiguous().cuda(), overlaps_bev) # 3d overlaps overlaps_3d = overlaps_bev.to(boxes1.device) * overlaps_h volume1 = boxes1.volume.view(-1, 1) volume2 = boxes2.volume.view(1, -1) if mode == 'iou': # the clamp func is used to avoid division of 0 iou3d = overlaps_3d / torch.clamp( volume1 + volume2 - overlaps_3d, min=1e-8) else: iou3d = overlaps_3d / torch.clamp(volume1, min=1e-8) return iou3d def new_box(self, data): """Create a new box object with data. The new box and its tensor has the similar properties \ as self and self.tensor, respectively. Args: data (torch.Tensor | numpy.array | list): Data to be copied. Returns: :obj:`BaseInstance3DBoxes`: A new bbox object with ``data``, \ the object's other properties are similar to ``self``. 
@unique
class Box3DMode(IntEnum):
    r"""Enum of different ways to represent a box.

    Coordinates in LiDAR:

    .. code-block:: none

                    up z
                       ^   x front
                       |  /
                       | /
        left y <------ 0

    The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0),
    and the yaw is around the z axis, thus the rotation axis=2.

    Coordinates in camera:

    .. code-block:: none

                z front
               /
              /
             0 ------> x right
             |
             |
             v
        down y

    The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5],
    and the yaw is around the y axis, thus the rotation axis=1.

    Coordinates in Depth mode:

    .. code-block:: none

        up z
           ^   y front
           |  /
           | /
           0 ------> x right

    The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0),
    and the yaw is around the z axis, thus the rotation axis=2.
    """

    LIDAR = 0
    CAM = 1
    DEPTH = 2

    @staticmethod
    def convert(box, src, dst, rt_mat=None):
        """Convert boxes from `src` mode to `dst` mode.

        The location (columns 0-2) is multiplied by ``rt_mat`` and the size
        columns (3-5) are reordered for the target axes. Every column from
        index 6 on, including the yaw, is copied over unchanged.

        Args:
            box (tuple | list | np.ndarray |
                torch.Tensor | BaseInstance3DBoxes):
                Can be a k-tuple, k-list or an Nxk array/tensor, where k >= 7.
            src (:obj:`BoxMode`): The src Box mode.
            dst (:obj:`BoxMode`): The target Box mode.
            rt_mat (np.ndarray | torch.Tensor): The rotation and translation
                matrix between different coordinates. Defaults to None.
                The conversion from `src` coordinates to `dst` coordinates
                usually comes along the change of sensors, e.g., from camera
                to LiDAR. This requires a transformation matrix.
                A matrix with 4 columns is applied to homogeneous
                coordinates.

        Returns:
            (tuple | list | np.ndarray | torch.Tensor | BaseInstance3DBoxes): \
                The converted box of the same type. When ``src == dst`` the
                input object itself is returned.
        """
        if src == dst:
            return box

        given_ndarray = isinstance(box, np.ndarray)
        given_boxes = isinstance(box, BaseInstance3DBoxes)
        given_sequence = isinstance(box, (list, tuple))

        # Always work on a copy so that the caller's data is never modified.
        if given_sequence:
            assert len(box) >= 7, (
                'BoxMode.convert takes either a k-tuple/list or '
                'an Nxk array/tensor, where k >= 7')
            arr = torch.tensor(box)[None, :]
        elif given_ndarray:
            arr = torch.from_numpy(np.asarray(box)).clone()
        elif given_boxes:
            arr = box.tensor.clone()
        else:
            arr = box.clone()

        # Each entry: source mode, target mode, default rotation matrix and
        # the source size columns in the order the target mode stores them.
        supported = (
            (Box3DMode.LIDAR, Box3DMode.CAM,
             [[0, -1, 0], [0, 0, -1], [1, 0, 0]], (4, 5, 3)),
            (Box3DMode.CAM, Box3DMode.LIDAR,
             [[0, 0, 1], [-1, 0, 0], [0, -1, 0]], (5, 3, 4)),
            (Box3DMode.DEPTH, Box3DMode.CAM,
             [[1, 0, 0], [0, 0, 1], [0, -1, 0]], (3, 5, 4)),
            (Box3DMode.CAM, Box3DMode.DEPTH,
             [[1, 0, 0], [0, 0, -1], [0, 1, 0]], (3, 5, 4)),
            (Box3DMode.LIDAR, Box3DMode.DEPTH,
             [[0, -1, 0], [1, 0, 0], [0, 0, 1]], (4, 3, 5)),
            (Box3DMode.DEPTH, Box3DMode.LIDAR,
             [[0, 1, 0], [-1, 0, 0], [0, 0, 1]], (4, 3, 5)),
        )
        for mode_from, mode_to, default_rows, size_cols in supported:
            if src == mode_from and dst == mode_to:
                if rt_mat is None:
                    rt_mat = arr.new_tensor(default_rows)
                xyz_size = torch.cat(
                    [arr[..., col:col + 1] for col in size_cols], dim=-1)
                break
        else:
            raise NotImplementedError(
                f'Conversion from Box3DMode {src} to {dst} '
                'is not supported yet')

        if not isinstance(rt_mat, torch.Tensor):
            rt_mat = arr.new_tensor(rt_mat)
        if rt_mat.size(1) == 4:
            # rotation + translation: append a column of ones first
            ones = arr.new_ones(arr.size(0), 1)
            xyz = torch.cat([arr[:, :3], ones], dim=-1) @ rt_mat.t()
        else:
            xyz = arr[:, :3] @ rt_mat.t()

        remains = arr[..., 6:]
        arr = torch.cat([xyz[:, :3], xyz_size, remains], dim=-1)

        # hand the result back in the container type that was passed in
        original_type = type(box)
        if given_sequence:
            return original_type(arr.flatten().tolist())
        if given_ndarray:
            return arr.numpy()
        if not given_boxes:
            return arr

        if dst == Box3DMode.CAM:
            target_type = CameraInstance3DBoxes
        elif dst == Box3DMode.LIDAR:
            target_type = LiDARInstance3DBoxes
        elif dst == Box3DMode.DEPTH:
            target_type = DepthInstance3DBoxes
        else:
            raise NotImplementedError(
                f'Conversion to {dst} through {original_type}'
                ' is not supported yet')
        return target_type(
            arr, box_dim=arr.size(-1), with_yaw=box.with_yaw)
class CameraInstance3DBoxes(BaseInstance3DBoxes):
    """3D boxes of instances in CAM coordinates.

    Coordinates in camera:

    .. code-block:: none

                z front (yaw=0.5*pi)
               /
              /
             0 ------> x right (yaw=0)
             |
             |
             v
        down y

    The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5),
    and the yaw is around the y axis, thus the rotation axis=1.
    The yaw is 0 at the positive direction of x axis, and increases from
    the positive direction of x to the positive direction of z.

    Attributes:
        tensor (torch.Tensor): Float matrix of N x box_dim.
        box_dim (int): Integer indicates the dimension of a box
            Each row is (x, y, z, x_size, y_size, z_size, yaw, ...).
        with_yaw (bool): Whether the boxes carry a yaw. It is set to False
            when 6-dim boxes are given; a zero yaw column is padded in
            that case.
    """

    def __init__(self,
                 tensor,
                 box_dim=7,
                 with_yaw=True,
                 origin=(0.5, 1.0, 0.5)):
        """Build the boxes from raw data.

        Args:
            tensor (torch.Tensor | np.ndarray | list): An N x box_dim
                matrix. It is converted to a float32 tensor and cloned.
            box_dim (int): Number of columns of ``tensor``. Defaults to 7.
            with_yaw (bool): Whether the boxes have a yaw. Ignored (forced
                to False) when ``box_dim`` is 6. Defaults to True.
            origin (tuple[float]): Relative position of the given location
                inside the box. When it differs from (0.5, 1.0, 0.5) the
                location is shifted so that it refers to the bottom center.
        """
        if isinstance(tensor, torch.Tensor):
            device = tensor.device
        else:
            device = torch.device('cpu')
        tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
        if tensor.numel() == 0:
            # Use reshape, so we don't end up creating a new tensor that
            # does not depend on the inputs (and consequently confuses jit)
            tensor = tensor.reshape((0, box_dim)).to(
                dtype=torch.float32, device=device)
        assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size()

        if tensor.shape[-1] == 6:
            # If the dimension of boxes is 6, we expand box_dim by padding
            # 0 as a fake yaw and set with_yaw to False.
            assert box_dim == 6
            fake_rot = tensor.new_zeros(tensor.shape[0], 1)
            tensor = torch.cat((tensor, fake_rot), dim=-1)
            self.box_dim = box_dim + 1
            self.with_yaw = False
        else:
            self.box_dim = box_dim
            self.with_yaw = with_yaw
        self.tensor = tensor.clone()

        if origin != (0.5, 1.0, 0.5):
            dst = self.tensor.new_tensor((0.5, 1.0, 0.5))
            src = self.tensor.new_tensor(origin)
            self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src)

    @property
    def height(self):
        """torch.Tensor: A vector with height of each box (column 4)."""
        return self.tensor[:, 4]

    @property
    def top_height(self):
        """torch.Tensor: A vector with the top height of each box."""
        # the positive direction is down rather than up
        return self.bottom_height - self.height

    @property
    def bottom_height(self):
        """torch.Tensor: A vector with bottom's height of each box
        (column 1)."""
        return self.tensor[:, 1]

    @property
    def gravity_center(self):
        """torch.Tensor: A tensor with the gravity center of each box.

        It is the bottom center moved by half of the box height along the
        negative y direction.
        """
        bottom_center = self.bottom_center
        gravity_center = torch.zeros_like(bottom_center)
        gravity_center[:, [0, 2]] = bottom_center[:, [0, 2]]
        gravity_center[:, 1] = bottom_center[:, 1] - self.tensor[:, 4] * 0.5
        return gravity_center

    @property
    def corners(self):
        """torch.Tensor: Coordinates of corners of all the boxes in
                         shape (N, 8, 3).

        Convert the boxes to  in clockwise order, in the form of
        (x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)

        .. code-block:: none

                         front z
                              /
                             /
               (x0, y0, z1) + -----------  + (x1, y0, z1)
                           /|            / |
                          / |           /  |
            (x0, y0, z0) + ----------- +   + (x1, y1, z1)
                         |  /      .   |  /
                         | / origin    | /
            (x0, y1, z0) + ----------- + -------> x right
                         |             (x1, y1, z0)
                         |
                         v
                    down y
        """
        # TODO: rotation_3d_in_axis function do not support
        #  empty tensor currently.
        assert len(self.tensor) != 0
        dims = self.dims
        corners_norm = torch.from_numpy(
            np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to(
                device=dims.device, dtype=dims.dtype)

        corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
        # use relative origin [0.5, 1, 0.5]
        corners_norm = corners_norm - dims.new_tensor([0.5, 1, 0.5])
        corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3])

        # rotate around y axis
        corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=1)
        corners += self.tensor[:, :3].view(-1, 1, 3)
        return corners

    @property
    def bev(self):
        """torch.Tensor: A n x 5 tensor of 2D BEV box of each box
        with rotation in XYWHR format (columns 0, 2, 3, 5 and 6)."""
        return self.tensor[:, [0, 2, 3, 5, 6]]

    @property
    def nearest_bev(self):
        """torch.Tensor: A tensor of 2D BEV box of each box
        without rotation, given as the two opposite corners of each box."""
        # Obtain BEV boxes with rotation in XZWHR format
        bev_rotated_boxes = self.bev
        # convert the rotation to a valid range
        rotations = bev_rotated_boxes[:, -1]
        normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi))

        # find the center of boxes
        # boxes rotated by more than 45 degrees get their two sizes swapped
        conditions = (normed_rotations > np.pi / 4)[..., None]
        bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:,
                                                                [0, 1, 3, 2]],
                                  bev_rotated_boxes[:, :4])

        centers = bboxes_xywh[:, :2]
        dims = bboxes_xywh[:, 2:]
        bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1)
        return bev_boxes

    def rotate(self, angle, points=None):
        """Rotate boxes with points (optional) with the given angle.

        The boxes are modified in place: the location is rotated around
        the y axis and ``angle`` is added to the yaw. Tensor and ndarray
        points are modified in place as well.

        Args:
            angle (float, torch.Tensor): Rotation angle.
            points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional):
                Points to rotate. Defaults to None.

        Returns:
            tuple or None: When ``points`` is None, the function returns \
                None, otherwise it returns the rotated points and the \
                rotation matrix ``rot_mat_T``.

        Raises:
            ValueError: If ``points`` is given with an unsupported type.
        """
        if not isinstance(angle, torch.Tensor):
            angle = self.tensor.new_tensor(angle)
        rot_sin = torch.sin(angle)
        rot_cos = torch.cos(angle)
        rot_mat_T = self.tensor.new_tensor([[rot_cos, 0, -rot_sin], [0, 1, 0],
                                            [rot_sin, 0, rot_cos]])

        self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T
        self.tensor[:, 6] += angle

        if points is not None:
            if isinstance(points, torch.Tensor):
                points[:, :3] = points[:, :3] @ rot_mat_T
            elif isinstance(points, np.ndarray):
                # rot_mat_T lives on the device of the boxes; Tensor.numpy()
                # only works for CPU tensors, so move it there first.
                rot_mat_T = rot_mat_T.cpu().numpy()
                points[:, :3] = np.dot(points[:, :3], rot_mat_T)
            elif isinstance(points, BasePoints):
                # clockwise
                points.rotate(-angle)
            else:
                raise ValueError
            return points, rot_mat_T

    def flip(self, bev_direction='horizontal', points=None):
        """Flip the boxes in BEV along given BEV direction.

        In CAM coordinates, it flips the x (horizontal) or z (vertical) axis.
        The boxes are modified in place, and so are tensor or ndarray
        points.

        Args:
            bev_direction (str): Flip direction (horizontal or vertical).
            points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None):
                Points to flip. Defaults to None.

        Returns:
            torch.Tensor, numpy.ndarray or None: Flipped points.
        """
        assert bev_direction in ('horizontal', 'vertical')
        if bev_direction == 'horizontal':
            # every 7th column starting at 0, i.e. x and, if present, column 7
            self.tensor[:, 0::7] = -self.tensor[:, 0::7]
            if self.with_yaw:
                self.tensor[:, 6] = -self.tensor[:, 6] + np.pi
        elif bev_direction == 'vertical':
            self.tensor[:, 2::7] = -self.tensor[:, 2::7]
            if self.with_yaw:
                self.tensor[:, 6] = -self.tensor[:, 6]

        if points is not None:
            assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints))
            if isinstance(points, (torch.Tensor, np.ndarray)):
                if bev_direction == 'horizontal':
                    points[:, 0] = -points[:, 0]
                elif bev_direction == 'vertical':
                    points[:, 2] = -points[:, 2]
            elif isinstance(points, BasePoints):
                points.flip(bev_direction)
            return points

    def in_range_bev(self, box_range):
        """Check whether the boxes are in the given range.

        Only the x and z location of each box is compared against the
        range, and all comparisons are strict.

        Args:
            box_range (list | torch.Tensor): The range of box
                (x_min, z_min, x_max, z_max).

        Note:
            The original implementation of SECOND checks whether boxes in
            a range by checking whether the points are in a convex
            polygon, we reduce the burden for simpler cases.

        Returns:
            torch.Tensor: Indicating whether each box is inside \
                the reference range.
        """
        in_range_flags = ((self.tensor[:, 0] > box_range[0])
                          & (self.tensor[:, 2] > box_range[1])
                          & (self.tensor[:, 0] < box_range[2])
                          & (self.tensor[:, 2] < box_range[3]))
        return in_range_flags

    @classmethod
    def height_overlaps(cls, boxes1, boxes2, mode='iou'):
        """Calculate height overlaps of two boxes.

        This function calculates the height overlaps between ``boxes1`` and
        ``boxes2``, where ``boxes1`` and ``boxes2`` should be in the same type.

        Args:
            boxes1 (:obj:`CameraInstance3DBoxes`): Boxes 1 contain N boxes.
            boxes2 (:obj:`CameraInstance3DBoxes`): Boxes 2 contain M boxes.
            mode (str, optional): Mode of iou calculation. Defaults to 'iou'.
                This argument is not used by the function body.

        Returns:
            torch.Tensor: Overlapped height of every pair of boxes, in \
                shape (N, M). Pairs without overlap get 0.
        """
        assert isinstance(boxes1, CameraInstance3DBoxes)
        assert isinstance(boxes2, CameraInstance3DBoxes)

        boxes1_top_height = boxes1.top_height.view(-1, 1)
        boxes1_bottom_height = boxes1.bottom_height.view(-1, 1)
        boxes2_top_height = boxes2.top_height.view(1, -1)
        boxes2_bottom_height = boxes2.bottom_height.view(1, -1)

        # In camera coordinate system
        # from up to down is the positive direction
        heighest_of_bottom = torch.min(boxes1_bottom_height,
                                       boxes2_bottom_height)
        lowest_of_top = torch.max(boxes1_top_height, boxes2_top_height)
        overlaps_h = torch.clamp(heighest_of_bottom - lowest_of_top, min=0)
        return overlaps_h

    def convert_to(self, dst, rt_mat=None):
        """Convert self to ``dst`` mode.

        Args:
            dst (:obj:`BoxMode`): The target Box mode.
            rt_mat (np.ndarray | torch.Tensor): The rotation and translation
                matrix between different coordinates. Defaults to None.
                The conversion from ``src`` coordinates to ``dst`` coordinates
                usually comes along the change of sensors, e.g., from camera
                to LiDAR. This requires a transformation matrix.

        Returns:
            :obj:`BaseInstance3DBoxes`:  \
                The converted box of the same type in the ``dst`` mode.
        """
        # imported here rather than at module level
        from .box_3d_mode import Box3DMode
        return Box3DMode.convert(
            box=self, src=Box3DMode.CAM, dst=dst, rt_mat=rt_mat)
@unique
class Coord3DMode(IntEnum):
    r"""Enum of different ways to represent a box
        and point cloud.

    Coordinates in LiDAR:

    .. code-block:: none

                    up z
                       ^   x front
                       |  /
                       | /
        left y <------ 0

    The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0),
    and the yaw is around the z axis, thus the rotation axis=2.

    Coordinates in camera:

    .. code-block:: none

                z front
               /
              /
             0 ------> x right
             |
             |
             v
        down y

    The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5],
    and the yaw is around the y axis, thus the rotation axis=1.

    Coordinates in Depth mode:

    .. code-block:: none

        up z
           ^   y front
           |  /
           | /
           0 ------> x right

    The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0),
    and the yaw is around the z axis, thus the rotation axis=2.
    """

    LIDAR = 0
    CAM = 1
    DEPTH = 2

    @staticmethod
    def convert(input, src, dst, rt_mat=None):
        """Convert boxes or points from `src` mode to `dst` mode.

        Box objects are sent to ``convert_box`` and point objects to
        ``convert_point``.

        Args:
            input (:obj:`BaseInstance3DBoxes` | :obj:`BasePoints`): Boxes
                or points to convert.
            src (:obj:`CoordMode`): The source mode.
            dst (:obj:`CoordMode`): The target mode.
            rt_mat (np.ndarray | torch.Tensor): Passed on unchanged.
                Defaults to None.

        Raises:
            NotImplementedError: If ``input`` is neither a box object nor
                a point object.
        """
        if isinstance(input, BaseInstance3DBoxes):
            return Coord3DMode.convert_box(input, src, dst, rt_mat=rt_mat)
        elif isinstance(input, BasePoints):
            return Coord3DMode.convert_point(input, src, dst, rt_mat=rt_mat)
        else:
            raise NotImplementedError

    @staticmethod
    def convert_box(box, src, dst, rt_mat=None):
        """Convert boxes from `src` mode to `dst` mode.

        The location (columns 0-2) is multiplied by ``rt_mat`` and the size
        columns (3-5) are reordered for the target axes. Every column from
        index 6 on, including the yaw, is copied over unchanged.

        Args:
            box (tuple | list | np.ndarray |
                torch.Tensor | BaseInstance3DBoxes):
                Can be a k-tuple, k-list or an Nxk array/tensor, where k >= 7.
            src (:obj:`CoordMode`): The src Box mode.
            dst (:obj:`CoordMode`): The target Box mode.
            rt_mat (np.ndarray | torch.Tensor): The rotation and translation
                matrix between different coordinates. Defaults to None.
                The conversion from `src` coordinates to `dst` coordinates
                usually comes along the change of sensors, e.g., from camera
                to LiDAR. This requires a transformation matrix.

        Returns:
            (tuple | list | np.ndarray | torch.Tensor | BaseInstance3DBoxes): \
                The converted box of the same type.
        """
        if src == dst:
            return box

        is_numpy = isinstance(box, np.ndarray)
        is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes)
        single_box = isinstance(box, (list, tuple))
        if single_box:
            assert len(box) >= 7, (
                'CoordMode.convert takes either a k-tuple/list or '
                'an Nxk array/tensor, where k >= 7')
            arr = torch.tensor(box)[None, :]
        else:
            # avoid modifying the input box
            if is_numpy:
                arr = torch.from_numpy(np.asarray(box)).clone()
            elif is_Instance3DBoxes:
                arr = box.tensor.clone()
            else:
                arr = box.clone()

        # convert box from `src` mode to `dst` mode.
        # NOTE(review): the default DEPTH<->CAM matrices below differ from
        # the ones used by ``convert_point`` -- confirm which is intended.
        x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6]
        if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]])
            xyz_size = torch.cat([y_size, z_size, x_size], dim=-1)
        elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]])
            xyz_size = torch.cat([z_size, x_size, y_size], dim=-1)
        elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]])
            xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
        elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
            xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
        elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
            xyz_size = torch.cat([y_size, x_size, z_size], dim=-1)
        elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]])
            xyz_size = torch.cat([y_size, x_size, z_size], dim=-1)
        else:
            raise NotImplementedError(
                f'Conversion from Coord3DMode {src} to {dst} '
                'is not supported yet')

        if not isinstance(rt_mat, torch.Tensor):
            rt_mat = arr.new_tensor(rt_mat)
        if rt_mat.size(1) == 4:
            # rotation + translation: use homogeneous coordinates
            extended_xyz = torch.cat(
                [arr[:, :3], arr.new_ones(arr.size(0), 1)], dim=-1)
            xyz = extended_xyz @ rt_mat.t()
        else:
            xyz = arr[:, :3] @ rt_mat.t()

        remains = arr[..., 6:]
        arr = torch.cat([xyz[:, :3], xyz_size, remains], dim=-1)

        # convert arr to the original type
        original_type = type(box)
        if single_box:
            return original_type(arr.flatten().tolist())
        if is_numpy:
            return arr.numpy()
        elif is_Instance3DBoxes:
            if dst == Coord3DMode.CAM:
                target_type = CameraInstance3DBoxes
            elif dst == Coord3DMode.LIDAR:
                target_type = LiDARInstance3DBoxes
            elif dst == Coord3DMode.DEPTH:
                target_type = DepthInstance3DBoxes
            else:
                raise NotImplementedError(
                    f'Conversion to {dst} through {original_type}'
                    ' is not supported yet')
            return target_type(
                arr, box_dim=arr.size(-1), with_yaw=box.with_yaw)
        else:
            return arr

    @staticmethod
    def convert_point(point, src, dst, rt_mat=None):
        """Convert points from `src` mode to `dst` mode.

        Only the first three columns are transformed; the remaining
        columns are copied over unchanged.

        Args:
            point (tuple | list | np.ndarray |
                torch.Tensor | BasePoints):
                Can be a k-tuple, k-list or an Nxk array/tensor.
            src (:obj:`CoordMode`): The src Point mode.
            dst (:obj:`CoordMode`): The target Point mode.
            rt_mat (np.ndarray | torch.Tensor): The rotation and translation
                matrix between different coordinates. Defaults to None.
                The conversion from `src` coordinates to `dst` coordinates
                usually comes along the change of sensors, e.g., from camera
                to LiDAR. This requires a transformation matrix.
                It is only used for the conversions between CAM and DEPTH,
                where it is combined with the default axis permutation;
                for every conversion involving LIDAR it is ignored.

        Returns:
            (tuple | list | np.ndarray | torch.Tensor | BasePoints): \
                The converted point of the same type.
        """
        if src == dst:
            return point

        is_numpy = isinstance(point, np.ndarray)
        is_InstancePoints = isinstance(point, BasePoints)
        single_point = isinstance(point, (list, tuple))
        if single_point:
            assert len(point) >= 3, (
                'CoordMode.convert takes either a k-tuple/list or '
                'an Nxk array/tensor, where k >= 3')
            arr = torch.tensor(point)[None, :]
        else:
            # avoid modifying the input point
            if is_numpy:
                arr = torch.from_numpy(np.asarray(point)).clone()
            elif is_InstancePoints:
                arr = point.tensor.clone()
            else:
                arr = point.clone()

        # The branches below call tensor methods on ``rt_mat``, so an
        # ndarray (or nested list) has to become a tensor first.
        if rt_mat is not None and not isinstance(rt_mat, torch.Tensor):
            rt_mat = arr.new_tensor(rt_mat)

        # convert point from `src` mode to `dst` mode.
        # TODO: LIDAR
        # only implemented provided Rt matrix in cam-depth conversion
        if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM:
            rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]])
        elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR:
            rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]])
        elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
            else:
                rt_mat = rt_mat.new_tensor(
                    [[1, 0, 0], [0, 0, -1], [0, 1, 0]]) @ \
                    rt_mat.transpose(1, 0)
        elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]])
            else:
                rt_mat = rt_mat @ rt_mat.new_tensor([[1, 0, 0], [0, 0, 1],
                                                     [0, -1, 0]])
        elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH:
            rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
        elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR:
            rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]])
        else:
            raise NotImplementedError(
                f'Conversion from Coord3DMode {src} to {dst} '
                'is not supported yet')

        if rt_mat.size(1) == 4:
            # rotation + translation: use homogeneous coordinates
            extended_xyz = torch.cat(
                [arr[:, :3], arr.new_ones(arr.size(0), 1)], dim=-1)
            xyz = extended_xyz @ rt_mat.t()
        else:
            xyz = arr[:, :3] @ rt_mat.t()

        remains = arr[:, 3:]
        arr = torch.cat([xyz[:, :3], remains], dim=-1)

        # convert arr to the original type
        original_type = type(point)
        if single_point:
            return original_type(arr.flatten().tolist())
        if is_numpy:
            return arr.numpy()
        elif is_InstancePoints:
            if dst == Coord3DMode.CAM:
                target_type = CameraPoints
            elif dst == Coord3DMode.LIDAR:
                target_type = LiDARPoints
            elif dst == Coord3DMode.DEPTH:
                target_type = DepthPoints
            else:
                raise NotImplementedError(
                    f'Conversion to {dst} through {original_type}'
                    ' is not supported yet')
            return target_type(
                arr,
                points_dim=arr.size(-1),
                attribute_dims=point.attribute_dims)
        else:
            return arr
original_type(arr.flatten().tolist()) if is_numpy: return arr.numpy() elif is_InstancePoints: if dst == Coord3DMode.CAM: target_type = CameraPoints elif dst == Coord3DMode.LIDAR: target_type = LiDARPoints elif dst == Coord3DMode.DEPTH: target_type = DepthPoints else: raise NotImplementedError( f'Conversion to {dst} through {original_type}' ' is not supported yet') return target_type( arr, points_dim=arr.size(-1), attribute_dims=point.attribute_dims) else: return arr ================================================ FILE: mmdet3d/core/bbox/structures/depth_box3d.py ================================================ import numpy as np import torch from mmdet3d.core.points import BasePoints from mmdet3d.ops import points_in_boxes_batch from .base_box3d import BaseInstance3DBoxes from .utils import limit_period, rotation_3d_in_axis class DepthInstance3DBoxes(BaseInstance3DBoxes): """3D boxes of instances in Depth coordinates. Coordinates in Depth: .. code-block:: none up z y front (yaw=0.5*pi) ^ ^ | / | / 0 ------> x right (yaw=0) The relative coordinate of bottom center in a Depth box is (0.5, 0.5, 0), and the yaw is around the z axis, thus the rotation axis=2. The yaw is 0 at the positive direction of x axis, and increases from the positive direction of x to the positive direction of y. Attributes: tensor (torch.Tensor): Float matrix of N x box_dim. box_dim (int): Integer indicates the dimension of a box Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). with_yaw (bool): If True, the value of yaw will be set to 0 as minmax boxes. """ @property def gravity_center(self): """torch.Tensor: A tensor with center of each box.""" bottom_center = self.bottom_center gravity_center = torch.zeros_like(bottom_center) gravity_center[:, :2] = bottom_center[:, :2] gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5 return gravity_center @property def corners(self): """torch.Tensor: Coordinates of corners of all the boxes in shape (N, 8, 3). 
def rotate(self, angle, points=None):
    """Rotate the boxes in place around the z axis, optionally together
    with a set of points.

    Args:
        angle (float, torch.Tensor): Rotation angle.
        points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional):
            Points to rotate. Defaults to None.

    Returns:
        tuple or None: ``None`` when ``points`` is None, otherwise the
            rotated points and the rotation matrix ``rot_mat_T`` that was
            applied (a numpy array when ``points`` is a numpy array).

    Raises:
        ValueError: If ``points`` is given but has an unsupported type.
    """
    if not isinstance(angle, torch.Tensor):
        angle = self.tensor.new_tensor(angle)
    sin_a, cos_a = torch.sin(angle), torch.cos(angle)
    rot_mat_T = self.tensor.new_tensor([[cos_a, -sin_a, 0],
                                        [sin_a, cos_a, 0],
                                        [0, 0, 1]]).T

    # Centers are rotated first; the size update below reads ``corners``
    # computed from the already rotated centers.
    self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T
    if self.with_yaw:
        self.tensor[:, 6] -= angle
    else:
        # Boxes without yaw stay axis aligned: take the x / y extent of
        # the rotated corners as the new x / y sizes.
        rotated_corners = self.corners @ rot_mat_T
        extents = []
        for axis_idx in (0, 1):
            coords = rotated_corners[..., axis_idx]
            upper = coords.max(dim=1, keepdim=True)[0]
            lower = coords.min(dim=1, keepdim=True)[0]
            extents.append(upper - lower)
        self.tensor[:, 3:5] = torch.cat(extents, dim=-1)

    if points is None:
        return None

    if isinstance(points, torch.Tensor):
        points[:, :3] = points[:, :3] @ rot_mat_T
    elif isinstance(points, np.ndarray):
        rot_mat_T = rot_mat_T.numpy()
        points[:, :3] = np.dot(points[:, :3], rot_mat_T)
    elif isinstance(points, BasePoints):
        # anti-clockwise
        points.rotate(angle)
    else:
        raise ValueError
    return points, rot_mat_T
def in_range_bev(self, box_range):
    """Check whether the box centers lie strictly inside a BEV range.

    Args:
        box_range (list | torch.Tensor): The range of box
            (x_min, y_min, x_max, y_max).

    Note:
        In the original implementation of SECOND, checking whether
        a box is in the range checks whether the points are in a convex
        polygon; here only the center is tested to reduce the burden
        for simpler cases.

    Returns:
        torch.Tensor: Boolean flags indicating whether each box is inside \
            the reference range.
    """
    center_x = self.tensor[:, 0]
    center_y = self.tensor[:, 1]
    inside_x = (center_x > box_range[0]) & (center_x < box_range[2])
    inside_y = (center_y > box_range[1]) & (center_y < box_range[3])
    return inside_x & inside_y
def get_surface_line_center(self):
    """Compute surface and line center of bounding boxes.

    For every box, the centers of its 6 faces and of its 12 edges are
    built as half-size offsets from the gravity center and rotated
    around the z axis with that box's own yaw.

    Returns:
        tuple[torch.Tensor]: ``(surface_center, line_center)`` with \
            shapes (N * 6, 3) and (N * 12, 3). Rows are grouped per box: \
            rows ``6 * i`` .. ``6 * i + 5`` (resp. ``12 * i`` .. \
            ``12 * i + 11``) belong to box ``i``.
    """
    obj_size = self.dims
    center = self.gravity_center.view(-1, 1, 3)
    batch_size = center.shape[0]

    rot_sin = torch.sin(-self.yaw)
    rot_cos = torch.cos(-self.yaw)
    rot_mat_T = self.yaw.new_zeros(tuple(list(self.yaw.shape) + [3, 3]))
    rot_mat_T[..., 0, 0] = rot_cos
    rot_mat_T[..., 0, 1] = -rot_sin
    rot_mat_T[..., 1, 0] = rot_sin
    rot_mat_T[..., 1, 1] = rot_cos
    rot_mat_T[..., 2, 2] = 1
    # (N, 3, 3) matrices that are right-multiplied onto row vectors.
    rot = rot_mat_T.transpose(2, 1)
    sizes = obj_size.view(batch_size, 1, 3)

    # Unit offsets of the 6 face centers (+-z, +-y, +-x).
    surface_offset = obj_size.new_tensor([[0, 0, 1], [0, 0, -1], [0, 1, 0],
                                          [0, -1, 0], [1, 0, 0],
                                          [-1, 0, 0]])
    surface_offset = surface_offset.view(1, 6, 3) / 2

    # Unit offsets of the 12 edge centers.
    line_offset = obj_size.new_tensor([[1, 0, 1], [-1, 0, 1], [0, 1, 1],
                                       [0, -1, 1], [1, 0, -1], [-1, 0, -1],
                                       [0, 1, -1], [0, -1, -1], [1, 1, 0],
                                       [1, -1, 0], [-1, 1, 0],
                                       [-1, -1, 0]])
    line_offset = line_offset.view(1, 12, 3) / 2

    # Batched matmul keeps the box dimension explicit, so every offset of
    # box i is rotated with the rotation of box i. The previous flattened
    # form paired the box-major offsets with ``rot_mat_T.repeat(k, 1, 1)``
    # (tiled as R0, R1, ..., R0, R1, ...), which mixed up rotations
    # between boxes whenever N > 1.
    surface_3d = torch.matmul(surface_offset * sizes, rot)
    line_3d = torch.matmul(line_offset * sizes, rot)

    surface_center = (center + surface_3d).reshape(-1, 3)
    line_center = (center + line_3d).reshape(-1, 3)

    return surface_center, line_center
@property
def gravity_center(self):
    """torch.Tensor: Center of each box, taken as the bottom center
    shifted up along z by half of the z size (column 5 of ``tensor``)."""
    base_xyz = self.bottom_center
    result = torch.zeros_like(base_xyz)
    # The planar coordinates are copied unchanged.
    result[:, :2] = base_xyz[:, :2]
    result[:, 2] = base_xyz[:, 2] + self.tensor[:, 5] * 0.5
    return result
@property
def nearest_bev(self):
    """torch.Tensor: Axis-aligned 2D BEV box of each box, in
    (x1, y1, x2, y2) form, obtained by dropping the rotation and swapping
    the two BEV sizes when the yaw is closer to a quarter turn."""
    xywhr = self.bev
    # Wrap the yaw with ``limit_period(..., 0.5, np.pi)`` and keep only
    # its magnitude.
    yaw_magnitude = limit_period(xywhr[:, -1], 0.5, np.pi).abs()
    # Boxes rotated by more than 45 degrees use swapped w / h.
    swap_sizes = (yaw_magnitude > np.pi / 4).unsqueeze(-1)
    xywh = torch.where(swap_sizes, xywhr[:, [0, 1, 3, 2]], xywhr[:, :4])
    center_xy = xywh[:, :2]
    size_wh = xywh[:, 2:]
    return torch.cat([center_xy - size_wh / 2, center_xy + size_wh / 2],
                     dim=-1)
def flip(self, bev_direction='horizontal', points=None):
    """Flip the boxes in place in BEV along the given direction.

    In LIDAR coordinates, it flips the y (horizontal) or x (vertical) axis.
    Every 7th column starting from the flipped axis is negated as well.

    Args:
        bev_direction (str): Flip direction (horizontal or vertical).
        points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None):
            Points to flip. Defaults to None.

    Returns:
        torch.Tensor, numpy.ndarray or None: Flipped points, or None when
            no points are given.
    """
    assert bev_direction in ('horizontal', 'vertical')
    # horizontal -> y column, vertical -> x column
    flip_axis = {'horizontal': 1, 'vertical': 0}.get(bev_direction)

    if flip_axis is not None:
        self.tensor[:, flip_axis::7] = -self.tensor[:, flip_axis::7]
        if self.with_yaw:
            mirrored_yaw = -self.tensor[:, 6]
            if bev_direction == 'horizontal':
                mirrored_yaw = mirrored_yaw + np.pi
            self.tensor[:, 6] = mirrored_yaw

    if points is None:
        return None

    assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints))
    if isinstance(points, (torch.Tensor, np.ndarray)):
        if flip_axis is not None:
            points[:, flip_axis] = -points[:, flip_axis]
    elif isinstance(points, BasePoints):
        points.flip(bev_direction)
    return points
def enlarged_box(self, extra_width):
    """Return a copy of the boxes grown by ``extra_width`` on every side.

    Args:
        extra_width (float | torch.Tensor): Extra width to enlarge the box.

    Returns:
        :obj:`LiDARInstance3DBoxes`: Enlarged boxes, built with
            ``self.new_box``; ``self.tensor`` is left untouched.
    """
    grown = self.tensor.clone()
    # Each of the three sizes gains ``extra_width`` on both ends.
    grown[:, 3:6] += extra_width * 2
    # The stored z is the bottom center, so it moves down by extra_width.
    grown[:, 2] -= extra_width
    return self.new_box(grown)
def rotation_3d_in_axis(points, angles, axis=0):
    """Rotate points by angles according to axis.

    Each group of points ``points[n]`` is right-multiplied by the 3 x 3
    matrix built from ``angles[n]``; the coordinate along ``axis`` is left
    unchanged and the other two are mixed with ``cos`` / ``sin``.

    Args:
        points (torch.Tensor): Points of shape (N, M, 3).
        angles (torch.Tensor): Vector of angles in shape (N,)
        axis (int, optional): The axis to be rotated. ``-1`` is accepted
            as an alias of ``2``. Defaults to 0.

    Raises:
        ValueError: when the axis is not in range [0, 1, 2], it will \
            raise value error.

    Returns:
        torch.Tensor: Rotated points in shape (N, M, 3)
    """
    rot_sin = torch.sin(angles)
    rot_cos = torch.cos(angles)
    ones = torch.ones_like(rot_cos)
    zeros = torch.zeros_like(rot_cos)
    if axis == 1:
        rot_mat_T = torch.stack([
            torch.stack([rot_cos, zeros, -rot_sin]),
            torch.stack([zeros, ones, zeros]),
            torch.stack([rot_sin, zeros, rot_cos])
        ])
    elif axis == 2 or axis == -1:
        rot_mat_T = torch.stack([
            torch.stack([rot_cos, -rot_sin, zeros]),
            torch.stack([rot_sin, rot_cos, zeros]),
            torch.stack([zeros, zeros, ones])
        ])
    elif axis == 0:
        # The x row / column must be the identity part. The previous row
        # order ([0, c, -s], [0, s, c], [1, 0, 0]) permuted the
        # coordinates, so even a zero angle moved the points.
        rot_mat_T = torch.stack([
            torch.stack([ones, zeros, zeros]),
            torch.stack([zeros, rot_cos, -rot_sin]),
            torch.stack([zeros, rot_sin, rot_cos])
        ])
    else:
        raise ValueError(f'axis should in range [0, 1, 2], got {axis}')

    # rot_mat_T has shape (3, 3, N): contract the coordinate index j per
    # batch element a.
    return torch.einsum('aij,jka->aik', (points, rot_mat_T))
def get_box_type(box_type):
    """Look up the box structure class and box mode for a type name.

    Args:
        box_type (str): The type of box structure; matched
            case-insensitively against "LiDAR", "Camera" and "Depth".

    Returns:
        tuple: Box type and box mode.

    Raises:
        ValueError: If ``box_type`` is none of the supported names.
    """
    from .box_3d_mode import (Box3DMode, CameraInstance3DBoxes,
                              DepthInstance3DBoxes, LiDARInstance3DBoxes)
    known_types = {
        'lidar': (LiDARInstance3DBoxes, Box3DMode.LIDAR),
        'camera': (CameraInstance3DBoxes, Box3DMode.CAM),
        'depth': (DepthInstance3DBoxes, Box3DMode.DEPTH),
    }
    key = box_type.lower()
    if key not in known_types:
        raise ValueError('Only "box_type" of "camera", "lidar", "depth"'
                         f' are supported, got {box_type}')
    box_type_3d, box_mode_3d = known_types[key]
    return box_type_3d, box_mode_3d
def bbox3d2roi(bbox_list):
    """Convert a list of bounding boxes to roi format.

    Args:
        bbox_list (list[torch.Tensor]): A list of bounding boxes
            corresponding to a batch of images.

    Returns:
        torch.Tensor: Region of interests in shape (n, c), where \
            the channels are in order of [batch_ind, x, y ...].
    """
    rois_list = []
    for img_id, bboxes in enumerate(bbox_list):
        if bboxes.size(0) > 0:
            img_inds = bboxes.new_full((bboxes.size(0), 1), img_id)
            rois = torch.cat([img_inds, bboxes], dim=-1)
        elif bboxes.dim() == 2:
            # An empty entry must carry the extra batch-index column too,
            # otherwise concatenating it with non-empty rois fails on the
            # mismatching channel count.
            rois = bboxes.new_zeros((0, bboxes.size(-1) + 1))
        else:
            # Empty tensor without a channel dimension: keep it as is.
            rois = torch.zeros_like(bboxes)
        rois_list.append(rois)
    rois = torch.cat(rois_list, 0)
    return rois
def average_precision(recalls, precisions, mode='area'):
    """Calculate average precision (for single or multiple scales).

    Args:
        recalls (np.ndarray): Recalls with shape of (num_scales, num_dets) \
            or (num_dets, ).
        precisions (np.ndarray): Precisions with shape of \
            (num_scales, num_dets) or (num_dets, ).
        mode (str): 'area' or '11points', 'area' means calculating the area
            under precision-recall curve, '11points' means calculating
            the average precision of recalls at [0, 0.1, ..., 1]

    Returns:
        float or np.ndarray: Calculated average precision, one float32
            value per scale.

    Raises:
        ValueError: If ``mode`` is neither 'area' nor '11points'.
    """
    if recalls.ndim == 1:
        # Promote a single curve to a batch of one scale.
        recalls = recalls[None, :]
        precisions = precisions[None, :]
    assert recalls.shape == precisions.shape
    assert recalls.ndim == 2

    n_scales = recalls.shape[0]
    ap = np.zeros(n_scales, dtype=np.float32)

    if mode == 'area':
        pad_lo = np.zeros((n_scales, 1), dtype=recalls.dtype)
        pad_hi = np.ones((n_scales, 1), dtype=recalls.dtype)
        rec = np.hstack((pad_lo, recalls, pad_hi))
        envelope = np.hstack((pad_lo, precisions, pad_lo))
        # Running maximum from the right: each precision becomes the best
        # precision reachable at an equal or later position.
        envelope = np.maximum.accumulate(envelope[:, ::-1], axis=1)[:, ::-1]
        for row in range(n_scales):
            # Only positions where the recall changes contribute area.
            steps = np.where(rec[row, 1:] != rec[row, :-1])[0]
            widths = rec[row, steps + 1] - rec[row, steps]
            ap[row] = np.sum(widths * envelope[row, steps + 1])
    elif mode == '11points':
        for row in range(n_scales):
            for thr in np.arange(0, 1 + 1e-3, 0.1):
                reached = precisions[row, recalls[row, :] >= thr]
                ap[row] += reached.max() if reached.size > 0 else 0
        ap /= 11
    else:
        raise ValueError(
            'Unrecognized mode, only "area" and "11points" are supported')
    return ap
""" # {img_id: {'bbox': box structure, 'det': matched list}} class_recs = {} npos = 0 for img_id in gt.keys(): cur_gt_num = len(gt[img_id]) if cur_gt_num != 0: gt_cur = torch.zeros([cur_gt_num, 7], dtype=torch.float32) for i in range(cur_gt_num): gt_cur[i] = gt[img_id][i].tensor bbox = gt[img_id][0].new_box(gt_cur) else: bbox = gt[img_id] det = [[False] * len(bbox) for i in iou_thr] npos += len(bbox) class_recs[img_id] = {'bbox': bbox, 'det': det} # construct dets image_ids = [] confidence = [] ious = [] for img_id in pred.keys(): cur_num = len(pred[img_id]) if cur_num == 0: continue pred_cur = torch.zeros((cur_num, 7), dtype=torch.float32) box_idx = 0 for box, score in pred[img_id]: image_ids.append(img_id) confidence.append(score) pred_cur[box_idx] = box.tensor box_idx += 1 pred_cur = box.new_box(pred_cur) gt_cur = class_recs[img_id]['bbox'] if len(gt_cur) > 0: # calculate iou in each image iou_cur = pred_cur.overlaps(pred_cur, gt_cur) for i in range(cur_num): ious.append(iou_cur[i]) else: for i in range(cur_num): ious.append(np.zeros(1)) confidence = np.array(confidence) # sort by confidence sorted_ind = np.argsort(-confidence) image_ids = [image_ids[x] for x in sorted_ind] ious = [ious[x] for x in sorted_ind] # go down dets and mark TPs and FPs nd = len(image_ids) tp_thr = [np.zeros(nd) for i in iou_thr] fp_thr = [np.zeros(nd) for i in iou_thr] for d in range(nd): R = class_recs[image_ids[d]] iou_max = -np.inf BBGT = R['bbox'] cur_iou = ious[d] if len(BBGT) > 0: # compute overlaps for j in range(len(BBGT)): # iou = get_iou_main(get_iou_func, (bb, BBGT[j,...])) iou = cur_iou[j] if iou > iou_max: iou_max = iou jmax = j for iou_idx, thresh in enumerate(iou_thr): if iou_max > thresh: if not R['det'][iou_idx][jmax]: tp_thr[iou_idx][d] = 1. R['det'][iou_idx][jmax] = 1 else: fp_thr[iou_idx][d] = 1. else: fp_thr[iou_idx][d] = 1. 
def eval_map_recall(pred, gt, ovthresh=None):
    """Evaluate mAP and recall.

    Runs ``eval_det_cls`` for every ground-truth class that also has
    predictions and regroups the results per IoU threshold. Classes
    without predictions get ``np.zeros(1)`` for recall, precision and AP.

    Args:
        pred (dict): Information of detection results,
            which maps class_id and predictions.
        gt (dict): Information of ground truths, which maps class_id and \
            ground truths.
        ovthresh (list[float]): iou threshold. Although the default is
            None, the value is iterated, so a list has to be passed.

    Return:
        tuple[dict]: dict results of recall, AP, and precision for all
            classes, returned as three lists (recall, precision, ap) with
            one ``{class_id: value}`` dict per threshold.
    """
    per_class = {
        cls_id: eval_det_cls(pred[cls_id], gt[cls_id], ovthresh)
        for cls_id in gt.keys() if cls_id in pred
    }

    recall = [{} for _ in ovthresh]
    precision = [{} for _ in ovthresh]
    ap = [{} for _ in ovthresh]

    for cls_id in gt.keys():
        has_pred = cls_id in pred
        for thr_idx, _ in enumerate(ovthresh):
            if has_pred:
                cls_rec, cls_prec, cls_ap = per_class[cls_id][thr_idx]
            else:
                cls_rec, cls_prec, cls_ap = (np.zeros(1), np.zeros(1),
                                             np.zeros(1))
            recall[thr_idx][cls_id] = cls_rec
            precision[thr_idx][cls_id] = cls_prec
            ap[thr_idx][cls_id] = cls_ap

    return recall, precision, ap
label2cat (dict): Map from label to category. logger (logging.Logger | str | None): The way to print the mAP summary. See `mmdet.utils.print_log()` for details. Default: None. Return: dict[str, float]: Dict of results. """ assert len(dt_annos) == len(gt_annos) pred = {} # map {class_id: pred} gt = {} # map {class_id: gt} for img_id in range(len(dt_annos)): # parse detected annotations det_anno = dt_annos[img_id] for i in range(len(det_anno['labels_3d'])): label = det_anno['labels_3d'].numpy()[i] bbox = det_anno['boxes_3d'].convert_to(box_mode_3d)[i] score = det_anno['scores_3d'].numpy()[i] if label not in pred: pred[int(label)] = {} if img_id not in pred[label]: pred[int(label)][img_id] = [] if label not in gt: gt[int(label)] = {} if img_id not in gt[label]: gt[int(label)][img_id] = [] pred[int(label)][img_id].append((bbox, score)) # parse gt annotations gt_anno = gt_annos[img_id] if gt_anno['gt_num'] != 0: gt_boxes = box_type_3d( gt_anno['gt_boxes_upright_depth'], box_dim=gt_anno['gt_boxes_upright_depth'].shape[-1], origin=(0.5, 0.5, 0.5)).convert_to(box_mode_3d) labels_3d = gt_anno['class'] else: gt_boxes = box_type_3d(np.array([], dtype=np.float32)) labels_3d = np.array([], dtype=np.int64) for i in range(len(labels_3d)): label = labels_3d[i] bbox = gt_boxes[i] if label not in gt: gt[label] = {} if img_id not in gt[label]: gt[label][img_id] = [] gt[label][img_id].append(bbox) rec, prec, ap = eval_map_recall(pred, gt, metric) ret_dict = dict() header = ['classes'] table_columns = [[label2cat[label] for label in ap[0].keys()] + ['Overall']] for i, iou_thresh in enumerate(metric): header.append(f'AP_{iou_thresh:.2f}') header.append(f'AR_{iou_thresh:.2f}') rec_list = [] for label in ap[i].keys(): ret_dict[f'{label2cat[label]}_AP_{iou_thresh:.2f}'] = float( ap[i][label][0]) ret_dict[f'mAP_{iou_thresh:.2f}'] = float( np.mean(list(ap[i].values()))) table_columns.append(list(map(float, list(ap[i].values())))) table_columns[-1] += [ret_dict[f'mAP_{iou_thresh:.2f}']] 
def clean_data(gt_anno, dt_anno, current_class, difficulty):
    """Flag ground truths and detections for one class and difficulty.

    Each flag is ``0`` (counted), ``1`` (ignored) or ``-1`` (other class).

    Args:
        gt_anno (dict): Ground-truth annotation; the keys 'name', 'bbox',
            'occluded' and 'truncated' are read.
        dt_anno (dict): Detection annotation; the keys 'name' and 'bbox'
            are read, 'bbox' being indexed as a 2D array.
        current_class (int): Index into ('car', 'pedestrian', 'cyclist').
        difficulty (int): Index into the height / occlusion / truncation
            limits defined below.

    Returns:
        tuple: ``(num_valid_gt, ignored_gt, ignored_dt, dc_bboxes)``, i.e.
            the number of ground truths flagged 0, the per-ground-truth
            flags, the per-detection flags and the boxes of the
            ground truths named 'DontCare'.
    """
    CLASS_NAMES = ['car', 'pedestrian', 'cyclist']
    MIN_HEIGHT = [40, 25, 25]
    MAX_OCCLUSION = [0, 1, 2]
    MAX_TRUNCATION = [0.15, 0.3, 0.5]
    # Classes treated as look-alikes of the evaluated class: they are
    # ignored instead of being counted as another class.
    similar_class = {'pedestrian': 'person_sitting', 'car': 'van'}

    target = CLASS_NAMES[current_class].lower()
    n_gt = len(gt_anno['name'])
    n_dt = len(dt_anno['name'])

    ignored_gt = []
    dc_bboxes = []
    num_valid_gt = 0
    for idx in range(n_gt):
        box = gt_anno['bbox'][idx]
        name = gt_anno['name'][idx].lower()
        box_height = box[3] - box[1]

        if name == target:
            cls_flag = 1
        elif similar_class.get(target) == name:
            cls_flag = 0
        else:
            cls_flag = -1

        too_hard = (gt_anno['occluded'][idx] > MAX_OCCLUSION[difficulty]
                    or gt_anno['truncated'][idx] > MAX_TRUNCATION[difficulty]
                    or box_height <= MIN_HEIGHT[difficulty])

        if cls_flag == 1 and not too_hard:
            ignored_gt.append(0)
            num_valid_gt += 1
        elif cls_flag >= 0:
            # look-alike class, or right class beyond this difficulty
            ignored_gt.append(1)
        else:
            ignored_gt.append(-1)

        if gt_anno['name'][idx] == 'DontCare':
            dc_bboxes.append(gt_anno['bbox'][idx])

    ignored_dt = []
    for idx in range(n_dt):
        same_class = dt_anno['name'][idx].lower() == target
        det_height = abs(dt_anno['bbox'][idx, 3] - dt_anno['bbox'][idx, 1])
        if det_height < MIN_HEIGHT[difficulty]:
            ignored_dt.append(1)
        elif same_class:
            ignored_dt.append(0)
        else:
            ignored_dt.append(-1)

    return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes
@numba.jit(nopython=True, parallel=True)
def d3_box_overlap_kernel(boxes, qboxes, rinc, criterion=-1):
    """Turn a matrix of 2D intersections into 3D overlaps, in place.

    On entry ``rinc[i, j]`` holds a 2D intersection value for the pair
    (``boxes[i]``, ``qboxes[j]``); ``d3_box_overlap`` fills it from columns
    ``[0, 2, 3, 5, 6]`` of each box. On exit every positive entry has been
    replaced by ``inc / ua`` where ``inc`` is that value multiplied by the
    overlap of the two boxes along column 1, or by 0.0 when the boxes do not
    overlap along column 1. Entries that were not positive are left untouched.

    Args:
        boxes (np.ndarray): Boxes, one per row. Column 1 is used as one end
            of the extent along that axis and ``column 1 - column 4`` as the
            other end; columns 3, 4 and 5 are multiplied to get the volume.
        qboxes (np.ndarray): Query boxes with the same column layout.
        rinc (np.ndarray): ``[len(boxes), len(qboxes)]`` matrix, modified
            in place.
        criterion (int): Selects the denominator ``ua``.
            -1: ``area1 + area2 - inc``, 0: ``area1`` (volume of the box),
            1: ``area2`` (volume of the query box), anything else: ``inc``.
    """
    # ONLY support overlap in CAMERA, not lidar.
    # TODO: change to use prange for parallel mode, should check the difference
    N, K = boxes.shape[0], qboxes.shape[0]
    for i in numba.prange(N):
        for j in numba.prange(K):
            # Only pairs with a positive 2D intersection can overlap in 3D.
            if rinc[i, j] > 0:
                # iw = (min(boxes[i, 1] + boxes[i, 4], qboxes[j, 1] +
                #         qboxes[j, 4]) - max(boxes[i, 1], qboxes[j, 1]))
                # Overlap of the intervals [col1 - col4, col1] of both boxes.
                iw = (
                    min(boxes[i, 1], qboxes[j, 1]) -
                    max(boxes[i, 1] - boxes[i, 4],
                        qboxes[j, 1] - qboxes[j, 4]))

                if iw > 0:
                    area1 = boxes[i, 3] * boxes[i, 4] * boxes[i, 5]
                    area2 = qboxes[j, 3] * qboxes[j, 4] * qboxes[j, 5]
                    # 3D intersection = 2D intersection * overlap on column 1.
                    inc = iw * rinc[i, j]
                    if criterion == -1:
                        ua = (area1 + area2 - inc)
                    elif criterion == 0:
                        ua = area1
                    elif criterion == 1:
                        ua = area2
                    else:
                        # NOTE(review): this makes the result exactly 1.0,
                        # whereas image_box_overlap uses ``ua = 1.0`` (raw
                        # intersection) for the same case - confirm intent.
                        ua = inc
                    rinc[i, j] = inc / ua
                else:
                    rinc[i, j] = 0.0
for i in range(gt_size): if ignored_gt[i] == -1: continue det_idx = -1 valid_detection = NO_DETECTION max_overlap = 0 assigned_ignored_det = False for j in range(det_size): if (ignored_det[j] == -1): continue if (assigned_detection[j]): continue if (ignored_threshold[j]): continue overlap = overlaps[j, i] dt_score = dt_scores[j] if (not compute_fp and (overlap > min_overlap) and dt_score > valid_detection): det_idx = j valid_detection = dt_score elif (compute_fp and (overlap > min_overlap) and (overlap > max_overlap or assigned_ignored_det) and ignored_det[j] == 0): max_overlap = overlap det_idx = j valid_detection = 1 assigned_ignored_det = False elif (compute_fp and (overlap > min_overlap) and (valid_detection == NO_DETECTION) and ignored_det[j] == 1): det_idx = j valid_detection = 1 assigned_ignored_det = True if (valid_detection == NO_DETECTION) and ignored_gt[i] == 0: fn += 1 elif ((valid_detection != NO_DETECTION) and (ignored_gt[i] == 1 or ignored_det[det_idx] == 1)): assigned_detection[det_idx] = True elif valid_detection != NO_DETECTION: tp += 1 # thresholds.append(dt_scores[det_idx]) thresholds[thresh_idx] = dt_scores[det_idx] thresh_idx += 1 if compute_aos: # delta.append(gt_alphas[i] - dt_alphas[det_idx]) delta[delta_idx] = gt_alphas[i] - dt_alphas[det_idx] delta_idx += 1 assigned_detection[det_idx] = True if compute_fp: for i in range(det_size): if (not (assigned_detection[i] or ignored_det[i] == -1 or ignored_det[i] == 1 or ignored_threshold[i])): fp += 1 nstuff = 0 if metric == 0: overlaps_dt_dc = image_box_overlap(dt_bboxes, dc_bboxes, 0) for i in range(dc_bboxes.shape[0]): for j in range(det_size): if (assigned_detection[j]): continue if (ignored_det[j] == -1 or ignored_det[j] == 1): continue if (ignored_threshold[j]): continue if overlaps_dt_dc[j, i] > min_overlap: assigned_detection[j] = True nstuff += 1 fp -= nstuff if compute_aos: tmp = np.zeros((fp + delta_idx, )) # tmp = [0] * fp for i in range(delta_idx): tmp[i + fp] = (1.0 + 
def get_split_parts(num, num_part):
    """Split ``num`` items into consecutive chunk sizes.

    The items are divided into ``num_part`` equally sized chunks, followed by
    one extra chunk holding the remainder when ``num`` is not divisible by
    ``num_part``. The returned sizes always sum to ``num``.

    Unlike a plain ``divmod`` split, zero-sized chunks are never produced:
    ``num_part`` is clamped to ``[1, num]``. Empty chunks would make the
    callers concatenate empty lists of arrays, which numpy rejects, and
    ``num_part == 0`` would divide by zero.

    Args:
        num (int): Total number of items.
        num_part (int): Requested number of equally sized chunks.

    Returns:
        list[int]: Size of every chunk, in order. Empty when ``num <= 0``.
    """
    if num <= 0:
        return []
    # Never ask for more chunks than there are items, and at least one.
    num_part = max(1, min(num_part, num))
    same_part, remain_num = divmod(num, num_part)
    parts = [same_part] * num_part
    if remain_num:
        parts.append(remain_num)
    return parts
""" assert len(gt_annos) == len(dt_annos) total_dt_num = np.stack([len(a['name']) for a in dt_annos], 0) total_gt_num = np.stack([len(a['name']) for a in gt_annos], 0) num_examples = len(gt_annos) split_parts = get_split_parts(num_examples, num_parts) parted_overlaps = [] example_idx = 0 for num_part in split_parts: gt_annos_part = gt_annos[example_idx:example_idx + num_part] dt_annos_part = dt_annos[example_idx:example_idx + num_part] if metric == 0: gt_boxes = np.concatenate([a['bbox'] for a in gt_annos_part], 0) dt_boxes = np.concatenate([a['bbox'] for a in dt_annos_part], 0) overlap_part = image_box_overlap(gt_boxes, dt_boxes) elif metric == 1: loc = np.concatenate( [a['location'][:, [0, 2]] for a in gt_annos_part], 0) dims = np.concatenate( [a['dimensions'][:, [0, 2]] for a in gt_annos_part], 0) rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0) gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) loc = np.concatenate( [a['location'][:, [0, 2]] for a in dt_annos_part], 0) dims = np.concatenate( [a['dimensions'][:, [0, 2]] for a in dt_annos_part], 0) rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0) dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) overlap_part = bev_box_overlap(gt_boxes, dt_boxes).astype(np.float64) elif metric == 2: loc = np.concatenate([a['location'] for a in gt_annos_part], 0) dims = np.concatenate([a['dimensions'] for a in gt_annos_part], 0) rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0) gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) loc = np.concatenate([a['location'] for a in dt_annos_part], 0) dims = np.concatenate([a['dimensions'] for a in dt_annos_part], 0) rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0) dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) overlap_part = d3_box_overlap(gt_boxes, dt_boxes).astype(np.float64) else: raise ValueError('unknown metric') 
def _prepare_data(gt_annos, dt_annos, current_class, difficulty):
    """Collect the per-sample arrays needed by the statistics kernels.

    For every sample, ``clean_data`` is run for the requested class and
    difficulty, and its ignore flags and DontCare boxes are stored next to
    the packed ground-truth rows (bbox + alpha) and detection rows
    (bbox + alpha + score).

    Args:
        gt_annos (list[dict]): Ground-truth annotations, one dict per sample.
        dt_annos (list[dict]): Detection annotations, one dict per sample.
        current_class: Class passed through to ``clean_data``.
        difficulty: Difficulty passed through to ``clean_data``.

    Returns:
        tuple: ``(gt_datas_list, dt_datas_list, ignored_gts, ignored_dets,
            dontcares, total_dc_num, total_num_valid_gt)``.
    """
    gt_rows_per_sample, dt_rows_per_sample = [], []
    gt_flags_per_sample, dt_flags_per_sample = [], []
    dc_per_sample, dc_counts = [], []
    valid_gt_total = 0
    for idx, gt_anno in enumerate(gt_annos):
        dt_anno = dt_annos[idx]
        num_valid_gt, ignored_gt, ignored_det, dc_bboxes = clean_data(
            gt_anno, dt_anno, current_class, difficulty)
        valid_gt_total += num_valid_gt

        gt_flags_per_sample.append(np.array(ignored_gt, dtype=np.int64))
        dt_flags_per_sample.append(np.array(ignored_det, dtype=np.int64))

        # Samples without DontCare regions still need a (0, 4) placeholder.
        dc_array = (
            np.stack(dc_bboxes, 0).astype(np.float64)
            if len(dc_bboxes) != 0 else np.zeros(
                (0, 4)).astype(np.float64))
        dc_counts.append(dc_array.shape[0])
        dc_per_sample.append(dc_array)

        gt_alpha_col = gt_anno['alpha'][..., np.newaxis]
        gt_rows_per_sample.append(
            np.concatenate([gt_anno['bbox'], gt_alpha_col], 1))

        dt_alpha_col = dt_anno['alpha'][..., np.newaxis]
        dt_score_col = dt_anno['score'][..., np.newaxis]
        dt_rows_per_sample.append(
            np.concatenate([dt_anno['bbox'], dt_alpha_col, dt_score_col], 1))
    dc_counts = np.stack(dc_counts, axis=0)
    return (gt_rows_per_sample, dt_rows_per_sample, gt_flags_per_sample,
            dt_flags_per_sample, dc_per_sample, dc_counts, valid_gt_total)
Args: gt_annos (dict): Must from get_label_annos() in kitti_common.py. dt_annos (dict): Must from get_label_annos() in kitti_common.py. current_classes (list[int]): 0: car, 1: pedestrian, 2: cyclist. difficultys (list[int]): Eval difficulty, 0: easy, 1: normal, 2: hard metric (int): Eval type. 0: bbox, 1: bev, 2: 3d min_overlaps (float): Min overlap. format: [num_overlap, metric, class]. num_parts (int): A parameter for fast calculate algorithm Returns: dict[str, np.ndarray]: recall, precision and aos """ assert len(gt_annos) == len(dt_annos) num_examples = len(gt_annos) if num_examples < num_parts: num_parts = num_examples split_parts = get_split_parts(num_examples, num_parts) rets = calculate_iou_partly(dt_annos, gt_annos, metric, num_parts) overlaps, parted_overlaps, total_dt_num, total_gt_num = rets N_SAMPLE_PTS = 41 num_minoverlap = len(min_overlaps) num_class = len(current_classes) num_difficulty = len(difficultys) precision = np.zeros( [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) recall = np.zeros( [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) aos = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) for m, current_class in enumerate(current_classes): for idx_l, difficulty in enumerate(difficultys): rets = _prepare_data(gt_annos, dt_annos, current_class, difficulty) (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares, total_dc_num, total_num_valid_gt) = rets for k, min_overlap in enumerate(min_overlaps[:, metric, m]): thresholdss = [] for i in range(len(gt_annos)): rets = compute_statistics_jit( overlaps[i], gt_datas_list[i], dt_datas_list[i], ignored_gts[i], ignored_dets[i], dontcares[i], metric, min_overlap=min_overlap, thresh=0.0, compute_fp=False) tp, fp, fn, similarity, thresholds = rets thresholdss += thresholds.tolist() thresholdss = np.array(thresholdss) thresholds = get_thresholds(thresholdss, total_num_valid_gt) thresholds = np.array(thresholds) pr = np.zeros([len(thresholds), 4]) idx = 0 
def get_mAP(prec):
    """Average every fourth precision sample into a percentage.

    Args:
        prec (np.ndarray): Precision values whose last axis indexes the
            sample points.

    Returns:
        np.ndarray | float: Sum of the samples at indices 0, 4, 8, ... of the
            last axis, divided by 11 and scaled by 100. The last axis is
            reduced away.
    """
    every_fourth = (prec[..., pos] for pos in range(0, prec.shape[-1], 4))
    # The built-in sum starts at 0 and adds left to right.
    return sum(every_fourth) / 11 * 100
def kitti_eval(gt_annos,
               dt_annos,
               current_classes,
               eval_types=['bbox', 'bev', '3d']):
    """KITTI evaluation.

    Args:
        gt_annos (list[dict]): Contain gt information of each sample.
        dt_annos (list[dict]): Contain detected information of each sample.
        current_classes (list[str]): Classes to evaluation. A single class,
            or integer class ids, are accepted as well.
        eval_types (list[str], optional): Types to eval.
            Defaults to ['bbox', 'bev', '3d']. The list is not modified.

    Returns:
        tuple: String and dict of evaluation results.
    """
    assert len(eval_types) > 0, 'must contain at least one evaluation type'
    if 'aos' in eval_types:
        assert 'bbox' in eval_types, 'must evaluate bbox when evaluating aos'
    # Work on a private copy: 'aos' may be appended below and must leak
    # neither into the caller's list nor into the shared default argument.
    eval_types = list(eval_types)
    overlap_0_7 = np.array([[0.7, 0.5, 0.5, 0.7, 0.5],
                            [0.7, 0.5, 0.5, 0.7, 0.5],
                            [0.7, 0.5, 0.5, 0.7, 0.5]])
    overlap_0_5 = np.array([[0.7, 0.5, 0.5, 0.7, 0.5],
                            [0.5, 0.25, 0.25, 0.5, 0.25],
                            [0.5, 0.25, 0.25, 0.5, 0.25]])
    min_overlaps = np.stack([overlap_0_7, overlap_0_5], axis=0)  # [2, 3, 5]
    class_to_name = {
        0: 'Car',
        1: 'Pedestrian',
        2: 'Cyclist',
        3: 'Van',
        4: 'Person_sitting',
    }
    name_to_class = {v: n for n, v in class_to_name.items()}
    if not isinstance(current_classes, (list, tuple)):
        current_classes = [current_classes]
    # Normalise class names to their integer ids.
    current_classes = [
        name_to_class[curcls] if isinstance(curcls, str) else curcls
        for curcls in current_classes
    ]
    min_overlaps = min_overlaps[:, :, current_classes]
    result = ''

    # check whether alpha is valid
    pred_alpha = False
    valid_alpha_gt = False
    for anno in dt_annos:
        if anno['alpha'].shape[0] != 0:
            pred_alpha = True
            break
    for anno in gt_annos:
        # Frames without any ground-truth object have an empty alpha array;
        # skip them instead of indexing into it.
        if anno['alpha'].shape[0] != 0 and anno['alpha'][0] != -10:
            valid_alpha_gt = True
            break
    # AOS is derived from the bbox evaluation, so it can only be reported
    # when 'bbox' was requested; otherwise do_eval returns no AOS values.
    compute_aos = (pred_alpha and valid_alpha_gt and 'bbox' in eval_types)
    if compute_aos and 'aos' not in eval_types:
        eval_types.append('aos')

    mAPbbox, mAPbev, mAP3d, mAPaos = do_eval(gt_annos, dt_annos,
                                             current_classes, min_overlaps,
                                             eval_types)

    ret_dict = {}
    difficulty = ['easy', 'moderate', 'hard']
    for j, curcls in enumerate(current_classes):
        # mAP threshold array: [num_minoverlap, metric, class]
        # mAP result: [num_class, num_diff, num_minoverlap]
        curcls_name = class_to_name[curcls]
        for i in range(min_overlaps.shape[0]):
            # prepare results for print
            result += ('{} AP@{:.2f}, {:.2f}, {:.2f}:\n'.format(
                curcls_name, *min_overlaps[i, :, j]))
            if mAPbbox is not None:
                result += 'bbox AP:{:.4f}, {:.4f}, {:.4f}\n'.format(
                    *mAPbbox[j, :, i])
            if mAPbev is not None:
                result += 'bev AP:{:.4f}, {:.4f}, {:.4f}\n'.format(
                    *mAPbev[j, :, i])
            if mAP3d is not None:
                result += '3d AP:{:.4f}, {:.4f}, {:.4f}\n'.format(
                    *mAP3d[j, :, i])
            if compute_aos:
                result += 'aos AP:{:.2f}, {:.2f}, {:.2f}\n'.format(
                    *mAPaos[j, :, i])

            # prepare results for logger
            for idx in range(3):
                # The first overlap setting is the strict one.
                if i == 0:
                    postfix = f'{difficulty[idx]}_strict'
                else:
                    postfix = f'{difficulty[idx]}_loose'
                prefix = f'KITTI/{curcls_name}'
                if mAP3d is not None:
                    ret_dict[f'{prefix}_3D_{postfix}'] = mAP3d[j, idx, i]
                if mAPbev is not None:
                    ret_dict[f'{prefix}_BEV_{postfix}'] = mAPbev[j, idx, i]
                if mAPbbox is not None:
                    ret_dict[f'{prefix}_2D_{postfix}'] = mAPbbox[j, idx, i]

    # calculate mAP over all classes if there are multiple classes
    if len(current_classes) > 1:
        # prepare results for print
        result += ('\nOverall AP@{}, {}, {}:\n'.format(*difficulty))
        if mAPbbox is not None:
            mAPbbox = mAPbbox.mean(axis=0)
            result += 'bbox AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAPbbox[:, 0])
        if mAPbev is not None:
            mAPbev = mAPbev.mean(axis=0)
            result += 'bev AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAPbev[:, 0])
        if mAP3d is not None:
            mAP3d = mAP3d.mean(axis=0)
            result += '3d AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAP3d[:, 0])
        if compute_aos:
            mAPaos = mAPaos.mean(axis=0)
            result += 'aos AP:{:.2f}, {:.2f}, {:.2f}\n'.format(*mAPaos[:, 0])

        # prepare results for logger
        for idx in range(3):
            postfix = f'{difficulty[idx]}'
            if mAP3d is not None:
                ret_dict[f'KITTI/Overall_3D_{postfix}'] = mAP3d[idx, 0]
            if mAPbev is not None:
                ret_dict[f'KITTI/Overall_BEV_{postfix}'] = mAPbev[idx, 0]
            if mAPbbox is not None:
                ret_dict[f'KITTI/Overall_2D_{postfix}'] = mAPbbox[idx, 0]

    return result, ret_dict
""" class_to_name = { 0: 'Car', 1: 'Pedestrian', 2: 'Cyclist', 3: 'Van', 4: 'Person_sitting', } class_to_range = { 0: [0.5, 0.95, 10], 1: [0.25, 0.7, 10], 2: [0.25, 0.7, 10], 3: [0.5, 0.95, 10], 4: [0.25, 0.7, 10], } name_to_class = {v: n for n, v in class_to_name.items()} if not isinstance(current_classes, (list, tuple)): current_classes = [current_classes] current_classes_int = [] for curcls in current_classes: if isinstance(curcls, str): current_classes_int.append(name_to_class[curcls]) else: current_classes_int.append(curcls) current_classes = current_classes_int overlap_ranges = np.zeros([3, 3, len(current_classes)]) for i, curcls in enumerate(current_classes): overlap_ranges[:, :, i] = np.array(class_to_range[curcls])[:, np.newaxis] result = '' # check whether alpha is valid compute_aos = False for anno in dt_annos: if anno['alpha'].shape[0] != 0: if anno['alpha'][0] != -10: compute_aos = True break mAPbbox, mAPbev, mAP3d, mAPaos = do_coco_style_eval( gt_annos, dt_annos, current_classes, overlap_ranges, compute_aos) for j, curcls in enumerate(current_classes): # mAP threshold array: [num_minoverlap, metric, class] # mAP result: [num_class, num_diff, num_minoverlap] o_range = np.array(class_to_range[curcls])[[0, 2, 1]] o_range[1] = (o_range[2] - o_range[0]) / (o_range[1] - 1) result += print_str((f'{class_to_name[curcls]} ' 'coco AP@{:.2f}:{:.2f}:{:.2f}:'.format(*o_range))) result += print_str((f'bbox AP:{mAPbbox[j, 0]:.2f}, ' f'{mAPbbox[j, 1]:.2f}, ' f'{mAPbbox[j, 2]:.2f}')) result += print_str((f'bev AP:{mAPbev[j, 0]:.2f}, ' f'{mAPbev[j, 1]:.2f}, ' f'{mAPbev[j, 2]:.2f}')) result += print_str((f'3d AP:{mAP3d[j, 0]:.2f}, ' f'{mAP3d[j, 1]:.2f}, ' f'{mAP3d[j, 2]:.2f}')) if compute_aos: result += print_str((f'aos AP:{mAPaos[j, 0]:.2f}, ' f'{mAPaos[j, 1]:.2f}, ' f'{mAPaos[j, 2]:.2f}')) return result ================================================ FILE: mmdet3d/core/evaluation/kitti_utils/rotate_iou.py ================================================ 
##################### # Based on https://github.com/hongzhenwang/RRPN-revise # Licensed under The MIT License # Author: yanyan, scrin@foxmail.com ##################### import math import numba import numpy as np from numba import cuda @numba.jit(nopython=True) def div_up(m, n): return m // n + (m % n > 0) @cuda.jit('(float32[:], float32[:], float32[:])', device=True, inline=True) def trangle_area(a, b, c): return ((a[0] - c[0]) * (b[1] - c[1]) - (a[1] - c[1]) * (b[0] - c[0])) / 2.0 @cuda.jit('(float32[:], int32)', device=True, inline=True) def area(int_pts, num_of_inter): area_val = 0.0 for i in range(num_of_inter - 2): area_val += abs( trangle_area(int_pts[:2], int_pts[2 * i + 2:2 * i + 4], int_pts[2 * i + 4:2 * i + 6])) return area_val @cuda.jit('(float32[:], int32)', device=True, inline=True) def sort_vertex_in_convex_polygon(int_pts, num_of_inter): if num_of_inter > 0: center = cuda.local.array((2, ), dtype=numba.float32) center[:] = 0.0 for i in range(num_of_inter): center[0] += int_pts[2 * i] center[1] += int_pts[2 * i + 1] center[0] /= num_of_inter center[1] /= num_of_inter v = cuda.local.array((2, ), dtype=numba.float32) vs = cuda.local.array((16, ), dtype=numba.float32) for i in range(num_of_inter): v[0] = int_pts[2 * i] - center[0] v[1] = int_pts[2 * i + 1] - center[1] d = math.sqrt(v[0] * v[0] + v[1] * v[1]) v[0] = v[0] / d v[1] = v[1] / d if v[1] < 0: v[0] = -2 - v[0] vs[i] = v[0] j = 0 temp = 0 for i in range(1, num_of_inter): if vs[i - 1] > vs[i]: temp = vs[i] tx = int_pts[2 * i] ty = int_pts[2 * i + 1] j = i while j > 0 and vs[j - 1] > temp: vs[j] = vs[j - 1] int_pts[j * 2] = int_pts[j * 2 - 2] int_pts[j * 2 + 1] = int_pts[j * 2 - 1] j -= 1 vs[j] = temp int_pts[j * 2] = tx int_pts[j * 2 + 1] = ty @cuda.jit( '(float32[:], float32[:], int32, int32, float32[:])', device=True, inline=True) def line_segment_intersection(pts1, pts2, i, j, temp_pts): A = cuda.local.array((2, ), dtype=numba.float32) B = cuda.local.array((2, ), dtype=numba.float32) C = 
@cuda.jit(
    '(float32[:], float32[:], int32, int32, float32[:])',
    device=True,
    inline=True)
def line_segment_intersection_v1(pts1, pts2, i, j, temp_pts):
    """Intersect edge ``i`` of one quadrilateral with edge ``j`` of another.

    Variant of ``line_segment_intersection`` based on signed triangle areas.
    The visible ``quadrilateral_intersection`` calls
    ``line_segment_intersection``, not this function.

    Args:
        pts1: Flat corner buffer ``[x0, y0, ..., x3, y3]`` of the first
            quadrilateral.
        pts2: Flat corner buffer of the second quadrilateral.
        i: Index of the edge of ``pts1``; it runs from corner ``i`` to corner
            ``(i + 1) % 4``.
        j: Index of the edge of ``pts2``; it runs from corner ``j`` to corner
            ``(j + 1) % 4``.
        temp_pts: Output buffer; receives the intersection point ``(x, y)``
            when the function returns True.

    Returns:
        bool: True when an intersection point was written to ``temp_pts``.
    """
    a = cuda.local.array((2, ), dtype=numba.float32)
    b = cuda.local.array((2, ), dtype=numba.float32)
    c = cuda.local.array((2, ), dtype=numba.float32)
    d = cuda.local.array((2, ), dtype=numba.float32)

    # Segment a-b is edge i of pts1, segment c-d is edge j of pts2.
    a[0] = pts1[2 * i]
    a[1] = pts1[2 * i + 1]

    b[0] = pts1[2 * ((i + 1) % 4)]
    b[1] = pts1[2 * ((i + 1) % 4) + 1]

    c[0] = pts2[2 * j]
    c[1] = pts2[2 * j + 1]

    d[0] = pts2[2 * ((j + 1) % 4)]
    d[1] = pts2[2 * ((j + 1) % 4) + 1]

    area_abc = trangle_area(a, b, c)
    area_abd = trangle_area(a, b, d)

    # Same sign (or zero): c and d are not strictly on opposite sides of a-b.
    if area_abc * area_abd >= 0:
        return False

    area_cda = trangle_area(c, d, a)
    # Signed area of (c, d, b), derived from the areas already computed.
    area_cdb = area_cda + area_abc - area_abd

    # Same sign (or zero): a and b are not strictly on opposite sides of c-d.
    if area_cda * area_cdb >= 0:
        return False
    # Interpolation parameter along a-b at which the crossing is placed.
    t = area_cda / (area_abd - area_abc)

    dx = t * (b[0] - a[0])
    dy = t * (b[1] - a[1])
    temp_pts[0] = a[0] + dx
    temp_pts[1] = a[1] + dy
    return True
@cuda.jit('(float32[:], float32[:])', device=True, inline=True)
def rbbox_to_corners(corners, rbbox):
    """Write the four corners of a rotated box into ``corners``.

    Args:
        corners: Output buffer of 8 values, filled as
            ``[x0, y0, x1, y1, x2, y2, x3, y3]``.
        rbbox: Rotated box read as ``[center_x, center_y, x_d, y_d, angle]``,
            where ``x_d`` and ``y_d`` are the full extents along the box's
            own axes.
    """
    # generate clockwise corners and rotate it clockwise
    angle = rbbox[4]
    a_cos = math.cos(angle)
    a_sin = math.sin(angle)
    center_x = rbbox[0]
    center_y = rbbox[1]
    x_d = rbbox[2]
    y_d = rbbox[3]
    corners_x = cuda.local.array((4, ), dtype=numba.float32)
    corners_y = cuda.local.array((4, ), dtype=numba.float32)
    # Corners of the unrotated box, centred on the origin, in the order
    # (-x, -y), (-x, +y), (+x, +y), (+x, -y).
    corners_x[0] = -x_d / 2
    corners_x[1] = -x_d / 2
    corners_x[2] = x_d / 2
    corners_x[3] = x_d / 2
    corners_y[0] = -y_d / 2
    corners_y[1] = y_d / 2
    corners_y[2] = y_d / 2
    corners_y[3] = -y_d / 2
    for i in range(4):
        # Rotate with the matrix [[cos, sin], [-sin, cos]], then translate
        # to the box centre.
        corners[2 * i] = a_cos * corners_x[i] + a_sin * corners_y[i] + center_x
        corners[2 * i +
                1] = -a_sin * corners_x[i] + a_cos * corners_y[i] + center_y
@cuda.jit(
    '(int64, int64, float32[:], float32[:], float32[:], int32)',
    fastmath=False)
def rotate_iou_kernel_eval(N,
                           K,
                           dev_boxes,
                           dev_query_boxes,
                           dev_iou,
                           criterion=-1):
    """Kernel of computing rotated iou.

    Each block handles a tile of up to 64 boxes by 64 query boxes: the
    tile's boxes are first staged in shared memory, then every thread
    computes one row of the tile and writes it to ``dev_iou``.

    Args:
        N (int): The number of boxes.
        K (int): The number of query boxes.
        dev_boxes (np.ndarray): Boxes on device, flattened to 5 values per
            box.
        dev_query_boxes (np.ndarray): Query boxes on device, flattened to
            5 values per box.
        dev_iou (np.ndarray): Computed iou to return, flattened row-major
            with ``K`` entries per box.
        criterion (int, optional): Indicate different type of iou.
            -1 indicate `area_inter / (area1 + area2 - area_inter)`,
            0 indicate `area_inter / area1`,
            1 indicate `area_inter / area2`.
            The query box is passed to ``devRotateIoUEval`` as its first
            argument, so ``area1`` there is the area of the query box.
    """
    # Must match the block size used by the launcher and the 64-box capacity
    # of the shared buffers below.
    threadsPerBlock = 8 * 8
    row_start = cuda.blockIdx.x
    col_start = cuda.blockIdx.y
    tx = cuda.threadIdx.x
    # Number of valid boxes / query boxes in this tile (the last tile along
    # each axis may be partial).
    row_size = min(N - row_start * threadsPerBlock, threadsPerBlock)
    col_size = min(K - col_start * threadsPerBlock, threadsPerBlock)
    block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
    block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)

    dev_query_box_idx = threadsPerBlock * col_start + tx
    dev_box_idx = threadsPerBlock * row_start + tx
    # Thread tx copies query box tx and box tx of the tile into shared
    # memory.
    if (tx < col_size):
        block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
        block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
        block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
        block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
        block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
    if (tx < row_size):
        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
    # All staged boxes must be visible before any thread reads them.
    cuda.syncthreads()
    if tx < row_size:
        for i in range(col_size):
            # Flat index of (row_start * 64 + tx, col_start * 64 + i) in the
            # row-major N x K output.
            offset = (
                row_start * threadsPerBlock * K + col_start * threadsPerBlock +
                tx * K + i)
            dev_iou[offset] = devRotateIoUEval(block_qboxes[i * 5:i * 5 + 5],
                                               block_boxes[tx * 5:tx * 5 + 5],
                                               criterion)
def load_lyft_gts(lyft, data_root, eval_split, logger=None):
    """Loads ground truth boxes from database.

    Args:
        lyft (:obj:`LyftDataset`): Lyft class in the sdk.
        data_root (str): Root of data for reading splits.
        eval_split (str): Name of the split for evaluation.
        logger (logging.Logger | str | None): Logger used for printing
            related information during evaluation. Default: None.

    Returns:
        list[dict]: List of annotation dictionaries.
    """
    # Scene names of the split; a set keeps the per-sample membership test
    # below constant-time.
    split_scenes = set(
        mmcv.list_from_file(osp.join(data_root, f'{eval_split}.txt')))

    # Read out all sample_tokens in DB.
    sample_tokens_all = [s['token'] for s in lyft.sample]
    assert len(sample_tokens_all) > 0, 'Error: Database has no samples!'

    if eval_split == 'test':
        # Check that you aren't trying to cheat :)
        assert len(lyft.sample_annotation) > 0, (
            'Error: You are trying to evaluate on the test set '
            'but you do not have the annotations!')

    # Keep only the samples whose scene belongs to the requested split.
    sample_tokens = []
    for sample_token in sample_tokens_all:
        scene_token = lyft.get('sample', sample_token)['scene_token']
        scene_record = lyft.get('scene', scene_token)
        if scene_record['name'] in split_scenes:
            sample_tokens.append(sample_token)

    all_annotations = []

    print_log('Loading ground truth annotations...', logger=logger)
    # Load annotations and filter predictions and annotations.
    for sample_token in mmcv.track_iter_progress(sample_tokens):
        sample = lyft.get('sample', sample_token)
        sample_annotation_tokens = sample['anns']
        for sample_annotation_token in sample_annotation_tokens:
            # Get label name in detection task and filter unused labels.
            sample_annotation = \
                lyft.get('sample_annotation', sample_annotation_token)
            detection_name = sample_annotation['category_name']
            if detection_name is None:
                continue
            annotation = {
                'sample_token': sample_token,
                'translation': sample_annotation['translation'],
                'size': sample_annotation['size'],
                'rotation': sample_annotation['rotation'],
                'name': detection_name,
            }
            all_annotations.append(annotation)

    return all_annotations
def lyft_eval(lyft, data_root, res_path, eval_set, output_dir, logger=None):
    """Evaluation API for Lyft dataset.

    Computes the per-class average precision at IoU thresholds
    0.5:0.05:0.95, logs a summary table and dumps all metrics to
    ``<output_dir>/lyft_metrics.json``.

    Args:
        lyft (:obj:`LyftDataset`): Lyft class in the sdk.
        data_root (str): Root of data for reading splits.
        res_path (str): Path of result json file recording detections.
        eval_set (str): Name of the split for evaluation.
        output_dir (str): Output directory for output json files.
        logger (logging.Logger | str | None): Logger used for printing
            related information during evaluation. Default: None.

    Returns:
        dict[str, float]: The evaluation results.
    """
    # evaluate by lyft metrics
    gts = load_lyft_gts(lyft, data_root, eval_set, logger)
    predictions = load_lyft_predictions(res_path)

    class_names = get_class_names(gts)
    # Routed through print_log so the message honours ``logger`` like the
    # rest of the evaluation output (falls back to print when it is None).
    print_log('Calculating mAP@0.5:0.95...', logger=logger)

    iou_thresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
    # Array of shape (num_classes, num_thresholds).
    average_precisions = \
        get_classwise_aps(gts, predictions, class_names, iou_thresholds)

    mAPs = np.mean(average_precisions, axis=0)  # per IoU threshold
    mAPs_cate = np.mean(average_precisions, axis=1)  # per class
    final_mAP = np.mean(mAPs)

    metrics = {}
    metrics['average_precisions'] = average_precisions.tolist()
    metrics['mAPs'] = mAPs.tolist()
    metrics['Final mAP'] = float(final_mAP)
    metrics['class_names'] = class_names
    metrics['mAPs_cate'] = mAPs_cate.tolist()

    # One table row per class plus an overall footer row.
    APs_data = [['class', 'mAP@0.5:0.95']]
    for i in range(len(class_names)):
        row = [class_names[i], round(mAPs_cate[i], 3)]
        APs_data.append(row)
    APs_data.append(['Overall', round(final_mAP, 3)])
    APs_table = AsciiTable(APs_data, title='mAPs@0.5:0.95')
    APs_table.inner_footing_row_border = True
    print_log(APs_table.table, logger=logger)

    metrics_path = osp.join(output_dir, 'lyft_metrics.json')
    mmcv.dump(metrics, metrics_path)
    return metrics
class_names, iou_thresholds): """Returns an array with an average precision per class. Note: Ground truth and predictions should have the following format. .. code-block:: gt = [{ 'sample_token': '0f0e3ce89d2324d8b45aa55a7b4f8207 fbb039a550991a5149214f98cec136ac', 'translation': [974.2811881299899, 1714.6815014457964, -23.689857123368846], 'size': [1.796, 4.488, 1.664], 'rotation': [0.14882026466054782, 0, 0, 0.9888642620837121], 'name': 'car' }] predictions = [{ 'sample_token': '0f0e3ce89d2324d8b45aa55a7b4f8207 fbb039a550991a5149214f98cec136ac', 'translation': [971.8343488872263, 1713.6816097857359, -25.82534357061308], 'size': [2.519726579986132, 7.810161372666739, 3.483438286096803], 'rotation': [0.10913582721095375, 0.04099572636992043, 0.01927712319721745, 1.029328402625659], 'name': 'car', 'score': 0.3077029437237213 }] Args: gt (list[dict]): list of dictionaries in the format described below. predictions (list[dict]): list of dictionaries in the format described below. class_names (list[str]): list of the class names. iou_thresholds (list[float]): IOU thresholds used to calculate TP / FN Returns: np.ndarray: an array with an average precision per class. """ assert all([0 <= iou_th <= 1 for iou_th in iou_thresholds]) gt_by_class_name = group_by_key(gt, 'name') pred_by_class_name = group_by_key(predictions, 'name') average_precisions = np.zeros((len(class_names), len(iou_thresholds))) for class_id, class_name in enumerate(class_names): if class_name in pred_by_class_name: recalls, precisions, average_precision = get_single_class_aps( gt_by_class_name[class_name], pred_by_class_name[class_name], iou_thresholds) average_precisions[class_id, :] = average_precision return average_precisions def get_single_class_aps(gt, predictions, iou_thresholds): """Compute recall and precision for all iou thresholds. Adapted from LyftDatasetDevkit. Args: gt (list[dict]): list of dictionaries in the format described above. 
predictions (list[dict]): list of dictionaries in the format \ described below. iou_thresholds (list[float]): IOU thresholds used to calculate \ TP / FN Returns: tuple[np.ndarray]: Returns (recalls, precisions, average precisions) for each class. """ num_gts = len(gt) image_gts = group_by_key(gt, 'sample_token') image_gts = wrap_in_box(image_gts) sample_gt_checked = { sample_token: np.zeros((len(boxes), len(iou_thresholds))) for sample_token, boxes in image_gts.items() } predictions = sorted(predictions, key=lambda x: x['score'], reverse=True) # go down dets and mark TPs and FPs num_predictions = len(predictions) tps = np.zeros((num_predictions, len(iou_thresholds))) fps = np.zeros((num_predictions, len(iou_thresholds))) for prediction_index, prediction in enumerate(predictions): predicted_box = Box3D(**prediction) sample_token = prediction['sample_token'] max_overlap = -np.inf jmax = -1 if sample_token in image_gts: gt_boxes = image_gts[sample_token] # gt_boxes per sample gt_checked = sample_gt_checked[sample_token] # gt flags per sample else: gt_boxes = [] gt_checked = None if len(gt_boxes) > 0: overlaps = get_ious(gt_boxes, predicted_box) max_overlap = np.max(overlaps) jmax = np.argmax(overlaps) for i, iou_threshold in enumerate(iou_thresholds): if max_overlap > iou_threshold: if gt_checked[jmax, i] == 0: tps[prediction_index, i] = 1.0 gt_checked[jmax, i] = 1 else: fps[prediction_index, i] = 1.0 else: fps[prediction_index, i] = 1.0 # compute precision recall fps = np.cumsum(fps, axis=0) tps = np.cumsum(tps, axis=0) recalls = tps / float(num_gts) # avoid divide by zero in case the first detection # matches a difficult ground truth precisions = tps / np.maximum(tps + fps, np.finfo(np.float64).eps) aps = [] for i in range(len(iou_thresholds)): recall = recalls[:, i] precision = precisions[:, i] assert np.all(0 <= recall) & np.all(recall <= 1) assert np.all(0 <= precision) & np.all(precision <= 1) ap = get_ap(recall, precision) aps.append(ap) aps = np.array(aps) 
def fast_hist(preds, labels, num_classes):
    """Build the confusion matrix of one batch of points.

    Points whose ground truth label lies outside ``[0, num_classes)`` are
    ignored.

    Args:
        preds (np.ndarray): Prediction labels of points with shape of
            (num_points, ).
        labels (np.ndarray): Ground truth labels of points with shape of
            (num_points, ).
        num_classes (int): number of classes

    Returns:
        np.ndarray: Confusion matrix of shape (num_classes, num_classes),
            indexed as [ground truth, prediction].
    """
    valid = (labels >= 0) & (labels < num_classes)
    # Flatten each (gt, pred) pair into a single bin index.
    flat_index = num_classes * labels[valid].astype(int) + preds[valid]
    counts = np.bincount(flat_index, minlength=num_classes**2)
    return counts[:num_classes**2].reshape(num_classes, num_classes)


def per_class_iou(hist):
    """Compute the per class iou.

    Args:
        hist (np.ndarray): Overall confusion matrix
            (num_classes, num_classes).

    Returns:
        np.ndarray: IoU of every class.
    """
    row_total = hist.sum(1)
    col_total = hist.sum(0)
    return np.diag(hist) / (row_total + col_total - np.diag(hist))


def get_acc(hist):
    """Compute the overall accuracy.

    Args:
        hist (np.ndarray): Overall confusion matrix
            (num_classes, num_classes).

    Returns:
        float: Fraction of counted points on the matrix diagonal.
    """
    correct = np.diag(hist).sum()
    return correct / hist.sum()


def get_acc_cls(hist):
    """Compute the class average accuracy.

    Args:
        hist (np.ndarray): Overall confusion matrix
            (num_classes, num_classes).

    Returns:
        float: Mean over classes of the per-row accuracy, ignoring NaN
            entries.
    """
    per_class_acc = np.diag(hist) / hist.sum(axis=1)
    return np.nanmean(per_class_acc)


def seg_eval(gt_labels, seg_preds, label2cat, logger=None):
    """Semantic Segmentation Evaluation.

    Accumulates one confusion matrix over all samples, derives per-class
    IoU, mIoU, overall accuracy and class-average accuracy from it, and
    logs them as a table.

    Args:
        gt_labels (list[torch.Tensor]): Ground truth labels.
        seg_preds (list[torch.Tensor]): Predictions.
        label2cat (dict): Map from label to category.
        logger (logging.Logger | str | None): The way to print the mAP
            summary. See `mmdet.utils.print_log()` for details. Default: None.

    Return:
        dict[str, float]: Dict of results.
    """
    assert len(seg_preds) == len(gt_labels)

    num_classes = len(label2cat)
    hist_list = [
        fast_hist(seg_preds[idx].numpy().astype(int),
                  gt_labels[idx].numpy().astype(int), num_classes)
        for idx in range(len(seg_preds))
    ]
    total_hist = sum(hist_list)

    iou = per_class_iou(total_hist)
    miou = np.nanmean(iou)
    acc = get_acc(total_hist)
    acc_cls = get_acc_cls(total_hist)

    header = ['classes']
    header += [label2cat[idx] for idx in range(num_classes)]
    header += ['miou', 'acc', 'acc_cls']

    ret_dict = dict()
    result_row = ['results']
    for idx in range(num_classes):
        ret_dict[label2cat[idx]] = float(iou[idx])
        result_row.append(f'{iou[idx]:.4f}')
    ret_dict['miou'] = float(miou)
    ret_dict['acc'] = float(acc)
    ret_dict['acc_cls'] = float(acc_cls)
    result_row.extend([f'{miou:.4f}', f'{acc:.4f}', f'{acc_cls:.4f}'])

    table = AsciiTable([header, tuple(result_row)])
    table.inner_footing_row_border = True
    print_log('\n' + table.table, logger=logger)

    return ret_dict
waymo_results_save_dir (str): Directory to save converted predictions in waymo format (.bin files). waymo_results_final_path (str): Path to save combined predictions in waymo format (.bin file), like 'a/b/c.bin'. prefix (str): Prefix of filename. In general, 0 for training, 1 for validation and 2 for testing. workers (str): Number of parallel processes. """ def __init__(self, kitti_result_files, waymo_tfrecords_dir, waymo_results_save_dir, waymo_results_final_path, prefix, workers=64): self.kitti_result_files = kitti_result_files self.waymo_tfrecords_dir = waymo_tfrecords_dir self.waymo_results_save_dir = waymo_results_save_dir self.waymo_results_final_path = waymo_results_final_path self.prefix = prefix self.workers = int(workers) self.name2idx = {} for idx, result in enumerate(kitti_result_files): if len(result['sample_idx']) > 0: self.name2idx[str(result['sample_idx'][0])] = idx # turn on eager execution for older tensorflow versions if int(tf.__version__.split('.')[0]) < 2: tf.enable_eager_execution() self.k2w_cls_map = { 'Car': label_pb2.Label.TYPE_VEHICLE, 'Pedestrian': label_pb2.Label.TYPE_PEDESTRIAN, 'Sign': label_pb2.Label.TYPE_SIGN, 'Cyclist': label_pb2.Label.TYPE_CYCLIST, } self.T_ref_to_front_cam = np.array([[0.0, 0.0, 1.0, 0.0], [-1.0, 0.0, 0.0, 0.0], [0.0, -1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0]]) self.get_file_names() self.create_folder() def get_file_names(self): """Get file names of waymo raw data.""" self.waymo_tfrecord_pathnames = sorted( glob(join(self.waymo_tfrecords_dir, '*.tfrecord'))) print(len(self.waymo_tfrecord_pathnames), 'tfrecords found.') def create_folder(self): """Create folder for data conversion.""" mmcv.mkdir_or_exist(self.waymo_results_save_dir) def parse_objects(self, kitti_result, T_k2w, context_name, frame_timestamp_micros): """Parse one prediction with several instances in kitti format and convert them to `Object` proto. Args: kitti_result (dict): Predictions in kitti format. - name (np.ndarray): Class labels of predictions. 
- dimensions (np.ndarray): Height, width, length of boxes. - location (np.ndarray): Bottom center of boxes (x, y, z). - rotation_y (np.ndarray): Orientation of boxes. - score (np.ndarray): Scores of predictions. T_k2w (np.ndarray): Transformation matrix from kitti to waymo. context_name (str): Context name of the frame. frame_timestamp_micros (int): Frame timestamp. Returns: :obj:`Object`: Predictions in waymo dataset Object proto. """ def parse_one_object(instance_idx): """Parse one instance in kitti format and convert them to `Object` proto. Args: instance_idx (int): Index of the instance to be converted. Returns: :obj:`Object`: Predicted instance in waymo dataset \ Object proto. """ cls = kitti_result['name'][instance_idx] length = round(kitti_result['dimensions'][instance_idx, 0], 4) height = round(kitti_result['dimensions'][instance_idx, 1], 4) width = round(kitti_result['dimensions'][instance_idx, 2], 4) x = round(kitti_result['location'][instance_idx, 0], 4) y = round(kitti_result['location'][instance_idx, 1], 4) z = round(kitti_result['location'][instance_idx, 2], 4) rotation_y = round(kitti_result['rotation_y'][instance_idx], 4) score = round(kitti_result['score'][instance_idx], 4) # y: downwards; move box origin from bottom center (kitti) to # true center (waymo) y -= height / 2 # frame transformation: kitti -> waymo x, y, z = self.transform(T_k2w, x, y, z) # different conventions heading = -(rotation_y + np.pi / 2) while heading < -np.pi: heading += 2 * np.pi while heading > np.pi: heading -= 2 * np.pi box = label_pb2.Label.Box() box.center_x = x box.center_y = y box.center_z = z box.length = length box.width = width box.height = height box.heading = heading o = metrics_pb2.Object() o.object.box.CopyFrom(box) o.object.type = self.k2w_cls_map[cls] o.score = score o.context_name = context_name o.frame_timestamp_micros = frame_timestamp_micros return o objects = metrics_pb2.Objects() for instance_idx in range(len(kitti_result['name'])): o = 
parse_one_object(instance_idx) objects.objects.append(o) return objects def convert_one(self, file_idx): """Convert action for single file. Args: file_idx (int): Index of the file to be converted. """ file_pathname = self.waymo_tfrecord_pathnames[file_idx] file_data = tf.data.TFRecordDataset(file_pathname, compression_type='') for frame_num, frame_data in enumerate(file_data): frame = open_dataset.Frame() frame.ParseFromString(bytearray(frame_data.numpy())) filename = f'{self.prefix}{file_idx:03d}{frame_num:03d}' for camera in frame.context.camera_calibrations: # FRONT = 1, see dataset.proto for details if camera.name == 1: T_front_cam_to_vehicle = np.array( camera.extrinsic.transform).reshape(4, 4) T_k2w = T_front_cam_to_vehicle @ self.T_ref_to_front_cam context_name = frame.context.name frame_timestamp_micros = frame.timestamp_micros if filename in self.name2idx: kitti_result = \ self.kitti_result_files[self.name2idx[filename]] objects = self.parse_objects(kitti_result, T_k2w, context_name, frame_timestamp_micros) else: print(filename, 'not found.') objects = metrics_pb2.Objects() with open( join(self.waymo_results_save_dir, f'{filename}.bin'), 'wb') as f: f.write(objects.SerializeToString()) def convert(self): """Convert action.""" print('Start converting ...') mmcv.track_parallel_progress(self.convert_one, range(len(self)), self.workers) print('\nFinished ...') # combine all files into one .bin pathnames = sorted(glob(join(self.waymo_results_save_dir, '*.bin'))) combined = self.combine(pathnames) with open(self.waymo_results_final_path, 'wb') as f: f.write(combined.SerializeToString()) def __len__(self): """Length of the filename list.""" return len(self.waymo_tfrecord_pathnames) def transform(self, T, x, y, z): """Transform the coordinates with matrix T. Args: T (np.ndarray): Transformation matrix. x(float): Coordinate in x axis. y(float): Coordinate in y axis. z(float): Coordinate in z axis. Returns: list: Coordinates after transformation. 
class BasePoints(object):
    """Base class for Points.

    Args:
        tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix.
        points_dim (int): Number of the dimension of a point.
            Each row is (x, y, z). Default to 3.
        attribute_dims (dict): Dictionary to indicate the meaning of extra
            dimension. Default to None.

    Attributes:
        tensor (torch.Tensor): Float matrix of N x points_dim.
        points_dim (int): Integer indicating the dimension of a point.
            Each row is (x, y, z, ...).
        attribute_dims (dict): Dictionary to indicate the meaning of extra
            dimension. Default to None.
        rotation_axis (int): Default rotation axis for points rotation.
    """

    def __init__(self, tensor, points_dim=3, attribute_dims=None):
        if isinstance(tensor, torch.Tensor):
            device = tensor.device
        else:
            device = torch.device('cpu')
        tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
        if tensor.numel() == 0:
            # Use reshape, so we don't end up creating a new tensor that
            # does not depend on the inputs (and consequently confuses jit)
            tensor = tensor.reshape((0, points_dim)).to(
                dtype=torch.float32, device=device)
        assert tensor.dim() == 2 and tensor.size(-1) == \
            points_dim, tensor.size()

        self.tensor = tensor
        self.points_dim = points_dim
        self.attribute_dims = attribute_dims
        self.rotation_axis = 0

    @property
    def coord(self):
        """torch.Tensor: Coordinates of each point with size (N, 3)."""
        return self.tensor[:, :3]

    @property
    def height(self):
        """torch.Tensor: A vector with height of each point, or None when
        no 'height' entry is registered in ``attribute_dims``."""
        if self.attribute_dims is not None and \
                'height' in self.attribute_dims.keys():
            return self.tensor[:, self.attribute_dims['height']]
        else:
            return None

    @property
    def color(self):
        """torch.Tensor: A vector with color of each point, or None when
        no 'color' entry is registered in ``attribute_dims``."""
        if self.attribute_dims is not None and \
                'color' in self.attribute_dims.keys():
            return self.tensor[:, self.attribute_dims['color']]
        else:
            return None

    @property
    def shape(self):
        """torch.Shape: Shape of points."""
        return self.tensor.shape

    def shuffle(self):
        """Shuffle the points."""
        self.tensor = self.tensor[torch.randperm(
            self.__len__(), device=self.tensor.device)]

    def rotate(self, rotation, axis=None):
        """Rotate points in place with the given rotation matrix or angle.

        Args:
            rotation (float, np.ndarray, torch.Tensor): Rotation matrix
                (3 x 3) or angle (single element).
            axis (int): Axis to rotate at. Defaults to None, in which case
                ``self.rotation_axis`` is used.

        Raises:
            ValueError: If ``axis`` is not one of 0, 1, 2 or -1.
        """
        if not isinstance(rotation, torch.Tensor):
            rotation = self.tensor.new_tensor(rotation)
        assert rotation.shape == torch.Size([3, 3]) or \
            rotation.numel() == 1

        if axis is None:
            axis = self.rotation_axis

        if rotation.numel() == 1:
            rot_sin = torch.sin(rotation)
            rot_cos = torch.cos(rotation)
            if axis == 1:
                rot_mat_T = rotation.new_tensor([[rot_cos, 0, -rot_sin],
                                                 [0, 1, 0],
                                                 [rot_sin, 0, rot_cos]])
            elif axis == 2 or axis == -1:
                rot_mat_T = rotation.new_tensor([[rot_cos, -rot_sin, 0],
                                                 [rot_sin, rot_cos, 0],
                                                 [0, 0, 1]])
            elif axis == 0:
                # Rotation about x must leave x untouched. The previous
                # matrix had its unit entry in the wrong row/column, so a
                # zero-angle rotation permuted the coordinates.
                rot_mat_T = rotation.new_tensor([[1, 0, 0],
                                                 [0, rot_cos, -rot_sin],
                                                 [0, rot_sin, rot_cos]])
            else:
                raise ValueError('axis should in range')
            rot_mat_T = rot_mat_T.T
        elif rotation.numel() == 9:
            # A full matrix is applied as given (points @ rotation).
            rot_mat_T = rotation
        else:
            raise NotImplementedError
        self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T

    @abstractmethod
    def flip(self, bev_direction='horizontal'):
        """Flip the points in BEV along given BEV direction."""
        pass

    def translate(self, trans_vector):
        """Translate points in place with the given translation vector.

        Args:
            trans_vector (np.ndarray, torch.Tensor): Translation
                vector of size 3 or nx3.
        """
        if not isinstance(trans_vector, torch.Tensor):
            trans_vector = self.tensor.new_tensor(trans_vector)
        trans_vector = trans_vector.squeeze(0)
        if trans_vector.dim() == 1:
            assert trans_vector.shape[0] == 3
        elif trans_vector.dim() == 2:
            assert trans_vector.shape[0] == self.tensor.shape[0] and \
                trans_vector.shape[1] == 3
        else:
            raise NotImplementedError(
                'Unsupported translation vector of shape {}'.format(
                    trans_vector.shape))
        self.tensor[:, :3] += trans_vector

    def in_range_3d(self, point_range):
        """Check whether the points are in the given range.

        Args:
            point_range (list | torch.Tensor): The range of point
                (x_min, y_min, z_min, x_max, y_max, z_max)

        Note:
            In the original implementation of SECOND, checking whether
            a box in the range checks whether the points are in a convex
            polygon, we try to reduce the burden for simpler cases.

        Returns:
            torch.Tensor: A binary vector indicating whether each point is \
                strictly inside the reference range.
        """
        in_range_flags = ((self.tensor[:, 0] > point_range[0])
                          & (self.tensor[:, 1] > point_range[1])
                          & (self.tensor[:, 2] > point_range[2])
                          & (self.tensor[:, 0] < point_range[3])
                          & (self.tensor[:, 1] < point_range[4])
                          & (self.tensor[:, 2] < point_range[5]))
        return in_range_flags

    @abstractmethod
    def in_range_bev(self, point_range):
        """Check whether the points are in the given range.

        Args:
            point_range (list | torch.Tensor): The range of point
                in order of (x_min, y_min, x_max, y_max).

        Returns:
            torch.Tensor: Indicating whether each point is inside \
                the reference range.
        """
        pass

    @abstractmethod
    def convert_to(self, dst, rt_mat=None):
        """Convert self to ``dst`` mode.

        Args:
            dst (:obj:`CoordMode`): The target Point mode.
            rt_mat (np.ndarray | torch.Tensor): The rotation and translation
                matrix between different coordinates. Defaults to None.
                The conversion from `src` coordinates to `dst` coordinates
                usually comes along the change of sensors, e.g., from camera
                to LiDAR. This requires a transformation matrix.

        Returns:
            :obj:`BasePoints`: The converted points of the same type \
                in the `dst` mode.
        """
        pass

    def scale(self, scale_factor):
        """Scale the point coordinates in place.

        Args:
            scale_factor (float): Scale factor applied to the first three
                columns (x, y, z).
        """
        self.tensor[:, :3] *= scale_factor

    def __getitem__(self, item):
        """
        Note:
            The following usage are allowed:
            1. `new_points = points[3]`:
                return a `Points` that contains only one point.
            2. `new_points = points[2:10]`:
                return a slice of points.
            3. `new_points = points[vector]`:
                where vector is a torch.BoolTensor with `length = len(points)`.
                Nonzero elements in the vector will be selected.
            4. `new_points = points[3:11, vector]`:
                return a slice of points and attribute dims.
            Note that the returned Points might share storage with this Points,
            subject to Pytorch's indexing semantics.

        Returns:
            :obj:`BasePoints`: A new object of  \
                :class:`BasePoints` after indexing.
        """
        original_type = type(self)
        if isinstance(item, int):
            return original_type(
                self.tensor[item].view(1, -1),
                points_dim=self.points_dim,
                attribute_dims=self.attribute_dims)
        elif isinstance(item, tuple) and len(item) == 2:
            if isinstance(item[1], slice):
                # slice.indices resolves missing / negative bounds against
                # the number of columns. The previous hand-written version
                # always added shape[1] to the stop value because of
                # operator precedence and indexed out of bounds.
                start, stop, step = item[1].indices(self.tensor.shape[1])
                item = (item[0], list(range(start, stop, step)))
            elif isinstance(item[1], int):
                # Keep the column axis so the result stays a matrix.
                item = (item[0], [item[1]])
            p = self.tensor[item[0], item[1]]

            # Selected columns that are attribute (non-coordinate) dims.
            keep_dims = set(item[1]).intersection(
                set(range(3, self.tensor.shape[1])))
            if self.attribute_dims is not None:
                attribute_dims = self.attribute_dims.copy()
                for key in self.attribute_dims.keys():
                    cur_attribute_dims = attribute_dims[key]
                    # An attribute may be stored as one int or a list of
                    # ints; normalise so both are handled the same way.
                    if isinstance(cur_attribute_dims, int):
                        cur_attribute_dims = [cur_attribute_dims]
                    intersect_attr = sorted(
                        set(cur_attribute_dims).intersection(keep_dims))
                    if len(intersect_attr) == 1:
                        attribute_dims[key] = intersect_attr[0]
                    elif len(intersect_attr) > 1:
                        attribute_dims[key] = intersect_attr
                    else:
                        attribute_dims.pop(key)
            else:
                attribute_dims = None
        elif isinstance(item, (slice, np.ndarray, torch.Tensor)):
            p = self.tensor[item]
            attribute_dims = self.attribute_dims
        else:
            raise NotImplementedError(f'Invalid slice {item}!')

        assert p.dim() == 2, \
            f'Indexing on Points with {item} failed to return a matrix!'
        return original_type(
            p, points_dim=p.shape[1], attribute_dims=attribute_dims)

    def __len__(self):
        """int: Number of points in the current object."""
        return self.tensor.shape[0]

    def __repr__(self):
        """str: Return a strings that describes the object."""
        return self.__class__.__name__ + '(\n    ' + str(self.tensor) + ')'

    @classmethod
    def cat(cls, points_list):
        """Concatenate a list of Points into a single Points.

        Args:
            points_list (list[:obj:`BasePoints`]): List of points.

        Returns:
            :obj:`BasePoints`: The concatenated Points. ``points_dim`` and
                ``attribute_dims`` are taken from the first element.
        """
        assert isinstance(points_list, (list, tuple))
        if len(points_list) == 0:
            return cls(torch.empty(0))
        assert all(isinstance(points, cls) for points in points_list)

        # use torch.cat (v.s. layers.cat)
        # so the returned points never share storage with input
        cat_points = cls(
            torch.cat([p.tensor for p in points_list], dim=0),
            points_dim=points_list[0].tensor.shape[1],
            attribute_dims=points_list[0].attribute_dims)
        return cat_points

    def to(self, device):
        """Convert current points to a specific device.

        Args:
            device (str | :obj:`torch.device`): The name of the device.

        Returns:
            :obj:`BasePoints`: A new points object on the \
                specific device.
        """
        original_type = type(self)
        return original_type(
            self.tensor.to(device),
            points_dim=self.points_dim,
            attribute_dims=self.attribute_dims)

    def clone(self):
        """Clone the Points.

        Returns:
            :obj:`BasePoints`: Points object with a cloned tensor and the \
                same ``points_dim`` and ``attribute_dims`` as self.
        """
        original_type = type(self)
        return original_type(
            self.tensor.clone(),
            points_dim=self.points_dim,
            attribute_dims=self.attribute_dims)

    @property
    def device(self):
        """str: The device of the points are on."""
        return self.tensor.device

    def __iter__(self):
        """Yield one point (one row of ``self.tensor``) at a time.

        Returns:
            torch.Tensor: A point of shape (points_dim,).
        """
        yield from self.tensor

    def new_point(self, data):
        """Create a new point object with data.

        The new point and its tensor has the similar properties \
            as self and self.tensor, respectively.

        Args:
            data (torch.Tensor | numpy.array | list): Data to be copied.

        Returns:
            :obj:`BasePoints`: A new point object with ``data``, \
                the object's other properties are similar to ``self``.
        """
        new_tensor = self.tensor.new_tensor(data) \
            if not isinstance(data, torch.Tensor) else data.to(self.device)
        original_type = type(self)
        return original_type(
            new_tensor,
            points_dim=self.points_dim,
            attribute_dims=self.attribute_dims)
""" in_range_flags = ((self.tensor[:, 0] > point_range[0]) & (self.tensor[:, 2] > point_range[1]) & (self.tensor[:, 0] < point_range[2]) & (self.tensor[:, 2] < point_range[3])) return in_range_flags def convert_to(self, dst, rt_mat=None): """Convert self to ``dst`` mode. Args: dst (:obj:`CoordMode`): The target Point mode. rt_mat (np.ndarray | torch.Tensor): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from `src` coordinates to `dst` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: :obj:`BasePoints`: The converted point of the same type \ in the `dst` mode. """ from mmdet3d.core.bbox import Coord3DMode return Coord3DMode.convert_point( point=self, src=Coord3DMode.CAM, dst=dst, rt_mat=rt_mat) ================================================ FILE: mmdet3d/core/points/depth_points.py ================================================ from .base_points import BasePoints class DepthPoints(BasePoints): """Points of instances in DEPTH coordinates. Args: tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. points_dim (int): Number of the dimension of a point. Each row is (x, y, z). Default to 3. attribute_dims (dict): Dictionary to indicate the meaning of extra dimension. Default to None. Attributes: tensor (torch.Tensor): Float matrix of N x points_dim. points_dim (int): Integer indicating the dimension of a point. Each row is (x, y, z, ...). attribute_dims (bool): Dictionary to indicate the meaning of extra dimension. Default to None. rotation_axis (int): Default rotation axis for points rotation. 
class LiDARPoints(BasePoints):
    """Point cloud stored in the LIDAR coordinate frame.

    Args:
        tensor (torch.Tensor | np.ndarray | list): An N x points_dim matrix.
        points_dim (int): Number of values per point; each row starts with
            (x, y, z). Defaults to 3.
        attribute_dims (dict, optional): Describes the meaning of the extra
            per-point dimensions. Defaults to None.

    Attributes:
        tensor (torch.Tensor): Float matrix of N x points_dim.
        points_dim (int): Number of values per point, (x, y, z, ...).
        attribute_dims (dict | None): Meaning of the extra dimensions.
        rotation_axis (int): Default axis for point rotation; set to 2 here.
    """

    def __init__(self, tensor, points_dim=3, attribute_dims=None):
        super(LiDARPoints, self).__init__(
            tensor,
            points_dim=points_dim,
            attribute_dims=attribute_dims,
        )
        self.rotation_axis = 2

    def flip(self, bev_direction='horizontal'):
        """Flip the points in place along the given BEV direction.

        Args:
            bev_direction (str): ``'horizontal'`` negates column 1 and
                ``'vertical'`` negates column 0. Any other value leaves the
                points untouched.
        """
        if bev_direction == 'horizontal':
            column = 1
        elif bev_direction == 'vertical':
            column = 0
        else:
            return
        self.tensor[:, column] = -self.tensor[:, column]

    def in_range_bev(self, point_range):
        """Check whether the points lie strictly inside a BEV range.

        Args:
            point_range (list | torch.Tensor): Bounds ordered as
                (x_min, y_min, x_max, y_max), compared with columns 0 and 1
                of the points.

        Returns:
            torch.Tensor: Flag per point, true when the point is strictly \
                inside the range (points on the border are excluded).
        """
        xs = self.tensor[:, 0]
        ys = self.tensor[:, 1]
        above_lower = (xs > point_range[0]) & (ys > point_range[1])
        return above_lower & (xs < point_range[2]) & (ys < point_range[3])

    def convert_to(self, dst, rt_mat=None):
        """Convert these points to the ``dst`` coordinate mode.

        Args:
            dst (:obj:`CoordMode`): The target point mode.
            rt_mat (np.ndarray | torch.Tensor, optional): Rotation and
                translation matrix between the two coordinate systems.
                Defaults to None.

        Returns:
            :obj:`BasePoints`: Whatever ``Coord3DMode.convert_point`` \
                returns for a conversion from LIDAR to ``dst``.
        """
        # Imported lazily, as in the rest of this package.
        from mmdet3d.core.bbox import Coord3DMode

        source_mode = Coord3DMode.LIDAR
        return Coord3DMode.convert_point(
            point=self, src=source_mode, dst=dst, rt_mat=rt_mat)
def aligned_3d_nms(boxes, scores, classes, thresh):
    """Class-aware greedy NMS for axis-aligned 3D boxes.

    The highest-scoring remaining box is kept, then every remaining box of
    the same class whose IoU with it exceeds ``thresh`` is dropped. Boxes of
    a different class are never suppressed by it.

    Args:
        boxes (torch.Tensor): Aligned boxes with shape [n, 6], laid out as
            (x1, y1, z1, x2, y2, z2).
        scores (torch.Tensor): Score of each box.
        classes (torch.Tensor): Class of each box.
        thresh (float): IoU threshold for suppression.

    Returns:
        torch.Tensor: Indices (``torch.long``) of the kept boxes, ordered \
            from highest to lowest score.
    """
    lower = boxes[:, 0:3]
    upper = boxes[:, 3:6]
    extent = upper - lower
    volume = extent[:, 0] * extent[:, 1] * extent[:, 2]
    zero = boxes.new_zeros(1, )

    # Ascending order: the best remaining candidate is always the last entry.
    remaining = torch.argsort(scores)
    kept = []
    while remaining.shape[0] != 0:
        best = remaining[-1]
        others = remaining[:-1]
        kept.append(best)

        # Per-axis overlap between the best box and every other candidate.
        overlap_lo = torch.max(lower[best], lower[others])
        overlap_hi = torch.min(upper[best], upper[others])
        overlap = torch.max(zero, overlap_hi - overlap_lo)
        inter = overlap[:, 0] * overlap[:, 1] * overlap[:, 2]

        iou = inter / (volume[best] + volume[others] - inter)
        # Zero the IoU across classes so suppression stays class-aware.
        iou = iou * (classes[best] == classes[others]).float()
        remaining = others[iou <= thresh]

    return boxes.new_tensor(kept, dtype=torch.long)
""" x1 = dets[:, 0] y1 = dets[:, 1] scores = dets[:, 2] order = scores.argsort()[::-1].astype(np.int32) # highest->lowest ndets = dets.shape[0] suppressed = np.zeros((ndets), dtype=np.int32) keep = [] for _i in range(ndets): i = order[_i] # start with highest score box if suppressed[ i] == 1: # if any box have enough iou with this, remove it continue keep.append(i) for _j in range(_i + 1, ndets): j = order[_j] if suppressed[j] == 1: continue # calculate center distance between i and j box dist = (x1[i] - x1[j])**2 + (y1[i] - y1[j])**2 # ovr = inter / areas[j] if dist <= thresh and scores[i] - scores[j] > socre_thre: suppressed[j] = 1 return keep[:post_max_size] ================================================ FILE: mmdet3d/core/post_processing/merge_augs.py ================================================ import torch from mmdet3d.ops.iou3d.iou3d_utils import nms_gpu, nms_normal_gpu from ..bbox import bbox3d2result, bbox3d_mapping_back, xywhr2xyxyr def merge_aug_bboxes_3d(aug_results, img_metas, test_cfg): """Merge augmented detection 3D bboxes and scores. Args: aug_results (list[dict]): The dict of detection results. The dict contains the following keys - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. - scores_3d (torch.Tensor): Detection scores. - labels_3d (torch.Tensor): Predicted box labels. img_metas (list[dict]): Meta information of each sample. test_cfg (dict): Test config. Returns: dict: Bounding boxes results in cpu mode, containing merged results. - boxes_3d (:obj:`BaseInstance3DBoxes`): Merged detection bbox. - scores_3d (torch.Tensor): Merged detection scores. - labels_3d (torch.Tensor): Merged predicted box labels. 
""" assert len(aug_results) == len(img_metas), \ '"aug_results" should have the same length as "img_metas", got len(' \ f'aug_results)={len(aug_results)} and len(img_metas)={len(img_metas)}' recovered_bboxes = [] recovered_scores = [] recovered_labels = [] for bboxes, img_info in zip(aug_results, img_metas): scale_factor = img_info[0]['pcd_scale_factor'] pcd_horizontal_flip = img_info[0]['pcd_horizontal_flip'] pcd_vertical_flip = img_info[0]['pcd_vertical_flip'] recovered_scores.append(bboxes['scores_3d']) recovered_labels.append(bboxes['labels_3d']) bboxes = bbox3d_mapping_back(bboxes['boxes_3d'], scale_factor, pcd_horizontal_flip, pcd_vertical_flip) recovered_bboxes.append(bboxes) aug_bboxes = recovered_bboxes[0].cat(recovered_bboxes) aug_bboxes_for_nms = xywhr2xyxyr(aug_bboxes.bev) aug_scores = torch.cat(recovered_scores, dim=0) aug_labels = torch.cat(recovered_labels, dim=0) # TODO: use a more elegent way to deal with nms if test_cfg.use_rotate_nms: nms_func = nms_gpu else: nms_func = nms_normal_gpu merged_bboxes = [] merged_scores = [] merged_labels = [] # Apply multi-class nms when merge bboxes if len(aug_labels) == 0: return bbox3d2result(aug_bboxes, aug_scores, aug_labels) for class_id in range(torch.max(aug_labels).item() + 1): class_inds = (aug_labels == class_id) bboxes_i = aug_bboxes[class_inds] bboxes_nms_i = aug_bboxes_for_nms[class_inds, :] scores_i = aug_scores[class_inds] labels_i = aug_labels[class_inds] if len(bboxes_nms_i) == 0: continue selected = nms_func(bboxes_nms_i, scores_i, test_cfg.nms_thr) merged_bboxes.append(bboxes_i[selected, :]) merged_scores.append(scores_i[selected]) merged_labels.append(labels_i[selected]) merged_bboxes = merged_bboxes[0].cat(merged_bboxes) merged_scores = torch.cat(merged_scores, dim=0) merged_labels = torch.cat(merged_labels, dim=0) _, order = merged_scores.sort(0, descending=True) num = min(test_cfg.max_num, len(aug_bboxes)) order = order[:num] merged_bboxes = merged_bboxes[order] merged_scores = 
merged_scores[order] merged_labels = merged_labels[order] return bbox3d2result(merged_bboxes, merged_scores, merged_labels) ================================================ FILE: mmdet3d/core/utils/__init__.py ================================================ from .gaussian import draw_heatmap_gaussian, gaussian_2d, gaussian_radius __all__ = ['gaussian_2d', 'gaussian_radius', 'draw_heatmap_gaussian'] ================================================ FILE: mmdet3d/core/utils/gaussian.py ================================================ import numpy as np import torch def gaussian_2d(shape, sigma=1): """Generate gaussian map. Args: shape (list[int]): Shape of the map. sigma (float): Sigma to generate gaussian map. Defaults to 1. Returns: np.ndarray: Generated gaussian map. """ m, n = [(ss - 1.) / 2. for ss in shape] y, x = np.ogrid[-m:m + 1, -n:n + 1] h = np.exp(-(x * x + y * y) / (2 * sigma * sigma)) h[h < np.finfo(h.dtype).eps * h.max()] = 0 return h def draw_heatmap_gaussian(heatmap, center, radius, k=1): """Get gaussian masked heatmap. Args: heatmap (torch.Tensor): Heatmap to be masked. center (torch.Tensor): Center coord of the heatmap. radius (int): Radius of gausian. K (int): Multiple of masked_gaussian. Defaults to 1. Returns: torch.Tensor: Masked heatmap. """ diameter = 2 * radius + 1 gaussian = gaussian_2d((diameter, diameter), sigma=diameter / 6) x, y = int(center[0]), int(center[1]) height, width = heatmap.shape[0:2] left, right = min(x, radius), min(width - x, radius + 1) top, bottom = min(y, radius), min(height - y, radius + 1) masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] masked_gaussian = torch.from_numpy( gaussian[radius - top:radius + bottom, radius - left:radius + right]).to(heatmap.device, torch.float32) if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: torch.max(masked_heatmap, masked_gaussian * k, out=masked_heatmap) return heatmap def gaussian_radius(det_size, min_overlap=0.5): """Get radius of gaussian. 
Args: det_size (tuple[torch.Tensor]): Size of the detection result. min_overlap (float): Gaussian_overlap. Defaults to 0.5. Returns: torch.Tensor: Computed radius. """ height, width = det_size a1 = 1 b1 = (height + width) c1 = width * height * (1 - min_overlap) / (1 + min_overlap) sq1 = torch.sqrt(b1**2 - 4 * a1 * c1) r1 = (b1 + sq1) / 2 a2 = 4 b2 = 2 * (height + width) c2 = (1 - min_overlap) * width * height sq2 = torch.sqrt(b2**2 - 4 * a2 * c2) r2 = (b2 + sq2) / 2 a3 = 4 * min_overlap b3 = -2 * min_overlap * (height + width) c3 = (min_overlap - 1) * width * height sq3 = torch.sqrt(b3**2 - 4 * a3 * c3) r3 = (b3 + sq3) / 2 return min(r1, r2, r3) ================================================ FILE: mmdet3d/core/visualizer/__init__.py ================================================ from .show_result import show_result __all__ = ['show_result'] ================================================ FILE: mmdet3d/core/visualizer/open3d_vis.py ================================================ import cv2 import numpy as np import torch from matplotlib import pyplot as plt try: import open3d as o3d from open3d import geometry except ImportError: raise ImportError( 'Please run "pip install open3d" to install open3d first.') def _draw_points(points, vis, points_size=2, point_color=(0.5, 0.5, 0.5), mode='xyz'): """Draw points on visualizer. Args: points (numpy.array | torch.tensor, shape=[N, 3+C]): points to visualize. vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer. points_size (int): the size of points to show on visualizer. Default: 2. point_color (tuple[float]): the color of points. Default: (0.5, 0.5, 0.5). mode (str): indicate type of the input points, avaliable mode ['xyz', 'xyzrgb']. Default: 'xyz'. Returns: tuple: points, color of each point. 
def _draw_bboxes(bbox3d,
                 vis,
                 points_colors,
                 pcd=None,
                 bbox_color=(0, 1, 0),
                 points_in_box_color=(1, 0, 0),
                 rot_axis=2,
                 center_mode='lidar_bottom',
                 mode='xyz'):
    """Draw 3D boxes on the visualizer and recolor the points inside them.

    ``points_colors`` is modified in place. ``bbox3d`` itself is left
    untouched because the function works on a copy.

    Args:
        bbox3d (numpy.array | torch.tensor, shape=[M, 7]):
            3d bbox (x, y, z, dx, dy, dz, yaw) to visualize.
        vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer.
        points_colors (numpy.array): color of each point.
        pcd (:obj:`open3d.geometry.PointCloud`): point cloud. Default: None.
        bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0).
        points_in_box_color (tuple[float]):
            the color of points inside bbox3d. Default: (1, 0, 0).
        rot_axis (int): rotation axis of bbox. Default: 2.
        center_mode (str): which point of the box the given center refers
            to. ``'lidar_bottom'`` adds half of the extent along
            ``rot_axis`` to the center, ``'camera_bottom'`` subtracts it,
            any other value leaves the center unchanged.
            Default: 'lidar_bottom'.
        mode (str): type of the input points, available modes
            ['xyz', 'xyzrgb']. Points are only recolored in 'xyz' mode.
            Default: 'xyz'.
    """
    boxes = bbox3d.cpu().numpy() if isinstance(bbox3d,
                                               torch.Tensor) else bbox3d
    boxes = boxes.copy()

    highlight = np.array(points_in_box_color)
    recolor_points = pcd is not None and mode == 'xyz'

    for box in boxes:
        # Both slices are views into the copied row.
        center = box[0:3]
        extent = box[3:6]

        euler = np.zeros(3)
        euler[rot_axis] = -box[6]
        rotation = geometry.get_rotation_matrix_from_xyz(euler)

        # Move a bottom-center annotation to the gravity center.
        half_extent = extent[rot_axis] / 2
        if center_mode == 'lidar_bottom':
            center[rot_axis] += half_extent
        elif center_mode == 'camera_bottom':
            center[rot_axis] -= half_extent

        oriented_box = geometry.OrientedBoundingBox(center, rotation, extent)
        wireframe = geometry.LineSet.create_from_oriented_bounding_box(
            oriented_box)
        wireframe.paint_uniform_color(bbox_color)
        vis.add_geometry(wireframe)

        if recolor_points:
            inside = oriented_box.get_point_indices_within_bounding_box(
                pcd.points)
            points_colors[inside] = highlight

    # Push the (possibly updated) colors back to the point cloud.
    if pcd is not None:
        pcd.colors = o3d.utility.Vector3dVector(points_colors)
        vis.update_geometry(pcd)
def _draw_bboxes_ind(bbox3d,
                     vis,
                     indices,
                     points_colors,
                     pcd=None,
                     bbox_color=(0, 1, 0),
                     points_in_box_color=(1, 0, 0),
                     rot_axis=2,
                     center_mode='lidar_bottom',
                     mode='xyz'):
    """Draw 3D boxes and recolor points using a precomputed membership mask.

    ``points_colors`` is modified in place. ``bbox3d`` itself is left
    untouched because the function works on a copy.

    Args:
        bbox3d (numpy.array | torch.tensor, shape=[M, 7]):
            3d bbox (x, y, z, dx, dy, dz, yaw) to visualize.
        vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer.
        indices (numpy.array | torch.tensor | None, shape=[N, M]):
            indicates which bbox3d each point lies in; column ``i`` is cast
            to a boolean mask over the points for box ``i``. When None, the
            boxes are drawn but no point is recolored.
        points_colors (numpy.array): color of each point.
        pcd (:obj:`open3d.geometry.PointCloud`): point cloud. Default: None.
        bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0).
        points_in_box_color (tuple[float]):
            the color of points which are in bbox3d. Default: (1, 0, 0).
        rot_axis (int): rotation axis of bbox. Default: 2.
        center_mode (str): which point of the box the given center refers
            to. ``'lidar_bottom'`` adds half of the extent along
            ``rot_axis`` to the center, ``'camera_bottom'`` subtracts it,
            any other value leaves the center unchanged.
            Default: 'lidar_bottom'.
        mode (str): type of the input points, available modes
            ['xyz', 'xyzrgb']. Points are only recolored in 'xyz' mode.
            Default: 'xyz'.
    """
    if isinstance(bbox3d, torch.Tensor):
        bbox3d = bbox3d.cpu().numpy()
    if isinstance(indices, torch.Tensor):
        indices = indices.cpu().numpy()
    bbox3d = bbox3d.copy()

    in_box_color = np.array(points_in_box_color)
    # ``indices`` defaults to None in the public wrapper; skip recoloring
    # instead of failing on ``None[:, i]``.
    recolor_points = (
        pcd is not None and mode == 'xyz' and indices is not None)

    for i in range(len(bbox3d)):
        center = bbox3d[i, 0:3]
        dim = bbox3d[i, 3:6]
        yaw = np.zeros(3)
        # TODO: fix problem of current coordinate system
        # dim[0], dim[1] = dim[1], dim[0]  # for current coordinate
        # yaw[rot_axis] = -(bbox3d[i, 6] - 0.5 * np.pi)
        yaw[rot_axis] = -bbox3d[i, 6]
        rot_mat = geometry.get_rotation_matrix_from_xyz(yaw)

        if center_mode == 'lidar_bottom':
            center[rot_axis] += dim[
                rot_axis] / 2  # bottom center to gravity center
        elif center_mode == 'camera_bottom':
            center[rot_axis] -= dim[
                rot_axis] / 2  # bottom center to gravity center
        box3d = geometry.OrientedBoundingBox(center, rot_mat, dim)

        line_set = geometry.LineSet.create_from_oriented_bounding_box(box3d)
        line_set.paint_uniform_color(bbox_color)
        # draw bboxes on visualizer
        vis.add_geometry(line_set)

        # change the color of points which are in box
        if recolor_points:
            # ``np.bool`` was removed in NumPy 1.24; the builtin ``bool`` is
            # the type it used to alias.
            points_colors[indices[:, i].astype(bool)] = in_box_color

    # update points colors
    if pcd is not None:
        pcd.colors = o3d.utility.Vector3dVector(points_colors)
        vis.update_geometry(pcd)
indices (numpy.array | torch.tensor, shape=[N, M]): indicate which bbox3d that each point lies in. Default: None. save_path (str): path to save visualized results. Default: None. points_size (int): the size of points to show on visualizer. Default: 2. point_color (tuple[float]): the color of points. Default: (0.5, 0.5, 0.5). bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0). points_in_box_color (tuple[float]): the color of points which are in bbox3d. Default: (1, 0, 0). rot_axis (int): rotation axis of bbox. Default: 2. center_mode (bool): indicate the center of bbox is bottom center or gravity center. avaliable mode ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. mode (str): indicate type of the input points, avaliable mode ['xyz', 'xyzrgb']. Default: 'xyz'. """ # TODO: support score and class info assert 0 <= rot_axis <= 2 # init visualizer vis = o3d.visualization.Visualizer() vis.create_window() mesh_frame = geometry.TriangleMesh.create_coordinate_frame( size=1, origin=[0, 0, 0]) # create coordinate frame vis.add_geometry(mesh_frame) # draw points pcd, points_colors = _draw_points(points, vis, points_size, point_color, mode) # draw boxes if bbox3d is not None: _draw_bboxes_ind(bbox3d, vis, indices, points_colors, pcd, bbox_color, points_in_box_color, rot_axis, center_mode, mode) if show: vis.run() if save_path is not None: vis.capture_screen_image(save_path) vis.destroy_window() def project_pts_on_img(points, raw_img, lidar2img_rt, max_distance=70, thickness=-1): """Project the 3D points cloud on 2D image. Args: points (numpy.array): 3D points cloud (x, y, z) to visualize. raw_img (numpy.array): The numpy array of image. lidar2img_rt (numpy.array, shape=[4, 4]): The projection matrix according to the camera intrinsic parameters. max_distance (float): the max distance of the points cloud. Default: 70. thickness (int, optional): The thickness of 2D points. Default: -1. 
def project_bbox3d_on_img(bboxes3d,
                          raw_img,
                          lidar2img_rt,
                          color=(0, 255, 0),
                          thickness=1):
    """Project 3D boxes onto a 2D image and display the result.

    The image is shown in an OpenCV window named ``project_bbox3d_img`` and
    the call blocks in ``cv2.waitKey(0)`` until a key is pressed. Nothing is
    returned and ``raw_img`` is not modified (drawing happens on a copy).

    Args:
        bboxes3d: Box container exposing a ``corners`` attribute that holds
            8 corner points (x, y, z) per box, i.e. ``M * 8`` points for
            ``M = corners.shape[0]`` boxes.
            NOTE(review): presumably a ``BaseInstance3DBoxes`` instance
            whose corners are array-compatible - confirm against callers.
        raw_img (numpy.array): The numpy array of image.
        lidar2img_rt (numpy.array, shape=[4, 4]): The projection matrix
            according to the camera intrinsic parameters.
        color (tuple[int]): the color to draw bboxes. Default: (0, 255, 0).
        thickness (int, optional): The thickness of bboxes. Default: 1.
    """
    img = raw_img.copy()
    corners_3d = bboxes3d.corners
    num_bbox = corners_3d.shape[0]
    # Homogeneous coordinates: append a column of ones to every corner.
    pts_4d = np.concatenate(
        [corners_3d.reshape(-1, 3),
         np.ones((num_bbox * 8, 1))], axis=-1)
    pts_2d = pts_4d @ lidar2img_rt.T

    # Clamp the depth so the perspective division below never divides by a
    # zero or negative value.
    pts_2d[:, 2] = np.clip(pts_2d[:, 2], a_min=1e-5, a_max=1e5)
    pts_2d[:, 0] /= pts_2d[:, 2]
    pts_2d[:, 1] /= pts_2d[:, 2]
    imgfov_pts_2d = pts_2d[..., :2].reshape(num_bbox, 8, 2)

    # The 12 edges of a box, as pairs of corner indices.
    line_indices = ((0, 1), (0, 3), (0, 4), (1, 2), (1, 5), (3, 2), (3, 7),
                    (4, 5), (4, 7), (2, 6), (5, 6), (6, 7))
    for i in range(num_bbox):
        # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the
        # type it used to alias.
        corners = imgfov_pts_2d[i].astype(int)
        for start, end in line_indices:
            # Hand OpenCV plain Python ints as pixel coordinates.
            pt_start = (int(corners[start, 0]), int(corners[start, 1]))
            pt_end = (int(corners[end, 0]), int(corners[end, 1]))
            cv2.line(img, pt_start, pt_end, color, thickness, cv2.LINE_AA)

    cv2.imshow('project_bbox3d_img', img)
    cv2.waitKey(0)
def __init__(self,
             points,
             bbox3d=None,
             save_path=None,
             points_size=2,
             point_color=(0.5, 0.5, 0.5),
             bbox_color=(0, 1, 0),
             points_in_box_color=(1, 0, 0),
             rot_axis=2,
             center_mode='lidar_bottom',
             mode='xyz'):
    """Create the open3d window and draw the initial points and boxes.

    Args:
        points (numpy.array | None, shape=[N, 3+C]): Points to visualize.
            When None, no point cloud is drawn and ``self.pcd`` /
            ``self.points_colors`` stay None.
        bbox3d (numpy.array, shape=[M, 7]): 3d bbox
            (x, y, z, dx, dy, dz, yaw) to visualize. Default: None.
        save_path (str): accepted for interface compatibility; it is not
            used by this method. Default: None.
        points_size (int): the size of points to show on visualizer.
            Default: 2.
        point_color (tuple[float]): the color of points.
            Default: (0.5, 0.5, 0.5).
        bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0).
        points_in_box_color (tuple[float]):
            the color of points which are in bbox3d. Default: (1, 0, 0).
        rot_axis (int): rotation axis of bbox, must be in [0, 2].
            Default: 2.
        center_mode (str): 'lidar_bottom' or 'camera_bottom'.
            Default: 'lidar_bottom'.
        mode (str): type of the input points, 'xyz' or 'xyzrgb'.
            Default: 'xyz'.
    """
    super(Visualizer, self).__init__()
    assert 0 <= rot_axis <= 2

    # init visualizer
    self.o3d_visualizer = o3d.visualization.Visualizer()
    self.o3d_visualizer.create_window()
    mesh_frame = geometry.TriangleMesh.create_coordinate_frame(
        size=1, origin=[0, 0, 0])  # create coordinate frame
    self.o3d_visualizer.add_geometry(mesh_frame)

    self.points_size = points_size
    self.point_color = point_color
    self.bbox_color = bbox_color
    self.points_in_box_color = points_in_box_color
    self.rot_axis = rot_axis
    self.center_mode = center_mode
    self.mode = mode

    # Always define these attributes so that drawing boxes without a point
    # cloud (points=None) does not fail with AttributeError; _draw_bboxes
    # skips every point-related step when pcd is None.
    self.pcd = None
    self.points_colors = None

    # draw points
    if points is not None:
        self.pcd, self.points_colors = _draw_points(
            points, self.o3d_visualizer, points_size, point_color, mode)

    # draw boxes
    if bbox3d is not None:
        _draw_bboxes(bbox3d, self.o3d_visualizer, self.points_colors,
                     self.pcd, bbox_color, points_in_box_color, rot_axis,
                     center_mode, mode)
def _write_ply(points, out_filename):
    """Write points to a text file, one ``v ...`` vertex line per point.

    Each point is written as ``v x y z``. When the array has exactly 6
    columns, columns 3-5 are cast to int and appended as three integers
    (``v x y z c0 c1 c2``).

    Args:
        points (np.ndarray): Points in shape (N, dim). Only the first three
            columns (plus columns 3-5 when ``dim == 6``) are written.
        out_filename (str): Filename to be saved. An existing file is
            overwritten.
    """
    num_points = points.shape[0]
    has_color = points.shape[1] == 6
    # The context manager closes the file even if a write raises.
    with open(out_filename, 'w') as fout:
        for i in range(num_points):
            if has_color:
                c = points[i, 3:].astype(int)
                fout.write(
                    'v %f %f %f %d %d %d\n' %
                    (points[i, 0], points[i, 1], points[i, 2], c[0], c[1],
                     c[2]))
            else:
                fout.write('v %f %f %f\n' %
                           (points[i, 0], points[i, 1], points[i, 2]))
def show_result(points, gt_bboxes, pred_bboxes, out_dir, filename, show=True):
    """Convert results into format that is directly readable for meshlab.

    Files are written into ``<out_dir>/<filename>/``:
    ``<filename>_points.obj`` for the points, ``<filename>_gt.ply`` for the
    ground truth boxes and ``<filename>_pred.ply`` for the predictions.
    Inputs that are ``None`` are skipped.

    The boxes are converted (bottom center to gravity center, yaw negated)
    on private copies, so the arrays passed in by the caller are left
    unchanged and can safely be reused.

    Args:
        points (np.ndarray): Points.
        gt_bboxes (np.ndarray): Ground truth boxes.
        pred_bboxes (np.ndarray): Predicted boxes.
        out_dir (str): Path of output directory
        filename (str): Filename of the current frame.
        show (bool): Visualize the results online.
    """
    if show:
        # Imported lazily so that open3d is only needed for online display.
        from .open3d_vis import Visualizer

        vis = Visualizer(points)
        if pred_bboxes is not None:
            vis.add_bboxes(bbox3d=pred_bboxes)
        if gt_bboxes is not None:
            vis.add_bboxes(bbox3d=gt_bboxes, bbox_color=(0, 0, 1))
        vis.show()

    result_path = osp.join(out_dir, filename)
    mmcv.mkdir_or_exist(result_path)

    if points is not None:
        _write_ply(points, osp.join(result_path, f'{filename}_points.obj'))

    if gt_bboxes is not None:
        # Copy first: the conversion below must not leak into the caller's
        # array.
        gt_bboxes = np.array(gt_bboxes, copy=True)
        # bottom center to gravity center
        gt_bboxes[..., 2] += gt_bboxes[..., 5] / 2
        # the positive direction for yaw in meshlab is clockwise
        gt_bboxes[:, 6] *= -1
        _write_oriented_bbox(gt_bboxes,
                             osp.join(result_path, f'{filename}_gt.ply'))

    if pred_bboxes is not None:
        pred_bboxes = np.array(pred_bboxes, copy=True)
        # bottom center to gravity center
        pred_bboxes[..., 2] += pred_bboxes[..., 5] / 2
        # the positive direction for yaw in meshlab is clockwise
        pred_bboxes[:, 6] *= -1
        _write_oriented_bbox(pred_bboxes,
                             osp.join(result_path, f'{filename}_pred.ply'))
def points_to_voxel(points,
                    voxel_size,
                    coors_range,
                    max_points=35,
                    reverse_index=True,
                    max_voxels=20000):
    """Group points (N, >=3) into voxels.

    The output buffers are allocated here and handed to one of the two
    kernels, which fill them in place and report how many voxels were
    created; the buffers are then trimmed to that length.

    Args:
        points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points
            and points[:, 3:] contain other information such as
            reflectivity.
        voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size.
            Non-array input is converted to an array of ``points.dtype``.
        coors_range (list[float | tuple[float] | ndarray]): Voxel range.
            format: xyzxyz, minmax. Non-array input is converted to an
            array of ``points.dtype``.
        max_points (int): Indicate maximum points contained in a voxel.
        reverse_index (bool): Whether return reversed coordinates.
            If points has xyz format and reverse_index is True, output
            coordinates will be zyx format, but points in features always
            xyz format.
        max_voxels (int): Maximum number of voxels this function creates.
            For second, 20000 is a good choice. Points should be shuffled
            for randomness before this function because max_voxels drops
            points.

    Returns:
        tuple[np.ndarray]:
            voxels: [M, max_points, ndim] array of ``points.dtype``.
            coordinates: [M, 3] int32 array.
            num_points_per_voxel: [M] int32 array.
    """
    point_dtype = points.dtype
    if not isinstance(voxel_size, np.ndarray):
        voxel_size = np.array(voxel_size, dtype=point_dtype)
    if not isinstance(coors_range, np.ndarray):
        coors_range = np.array(coors_range, dtype=point_dtype)

    extent = coors_range[3:] - coors_range[:3]
    grid_shape = tuple(
        np.round(extent / voxel_size).astype(np.int32).tolist())
    if reverse_index:
        grid_shape = grid_shape[::-1]
        kernel = _points_to_voxel_reverse_kernel
    else:
        kernel = _points_to_voxel_kernel

    # Large arrays are allocated here rather than inside the jitted kernel.
    counts = np.zeros(shape=(max_voxels, ), dtype=np.int32)
    index_map = np.full(grid_shape, -1, dtype=np.int32)
    voxel_buf = np.zeros(
        shape=(max_voxels, max_points, points.shape[-1]), dtype=point_dtype)
    coor_buf = np.zeros(shape=(max_voxels, 3), dtype=np.int32)

    created = kernel(points, voxel_size, coors_range, counts, index_map,
                     voxel_buf, coor_buf, max_points, max_voxels)

    return voxel_buf[:created], coor_buf[:created], counts[:created]
@numba.jit(nopython=True)
def _points_to_voxel_kernel(points,
                            voxel_size,
                            coors_range,
                            num_points_per_voxel,
                            coor_to_voxelidx,
                            voxels,
                            coors,
                            max_points=35,
                            max_voxels=20000):
    """Assign points to voxels, storing voxel coordinates in axis order.

    Counterpart of ``_points_to_voxel_reverse_kernel`` that writes the
    voxel coordinate of axis ``j`` to ``coor[j]`` (no reversal). All output
    buffers are pre-allocated by the caller (see ``points_to_voxel``) and
    filled in place; only the number of created voxels is returned.

    Args:
        points (np.ndarray): [N, ndim]. Only columns 0..2 are used to
            locate the voxel; the whole row is copied into ``voxels``.
        voxel_size (np.ndarray): [3] voxel size along each of the three
            axes.
        coors_range (np.ndarray): [6] range of voxels; entries ``[:3]`` are
            the lower bounds and ``[3:]`` the upper bounds.
        num_points_per_voxel (np.ndarray): Per-voxel point counter,
            incremented in place.
        coor_to_voxelidx (np.ndarray): 3-D lookup indexed by voxel
            coordinate. An entry of -1 means no voxel has been created at
            that coordinate yet; otherwise it is the voxel's row index.
            Updated in place.
        voxels (np.ndarray): Output buffer indexed as
            ``[voxel, slot]``; receives the points of each voxel.
        coors (np.ndarray): Output buffer; row ``k`` receives the
            coordinate of voxel ``k``.
        max_points (int): Maximum number of points stored per voxel.
            Further points falling into a full voxel are ignored.
        max_voxels (int): Maximum number of voxels to create. Points
            should be shuffled for randomness before calling because
            processing stops once this limit would be exceeded.

    Returns:
        int: Number of voxels created, i.e. the number of valid leading
            rows in ``voxels``, ``coors`` and ``num_points_per_voxel``.
    """
    N = points.shape[0]
    # Only the first three columns take part in the binning.
    ndim = 3
    grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size
    # Round into the same array, then cast to integer grid dimensions.
    grid_size = np.round(grid_size, 0, grid_size).astype(np.int32)

    # Scratch buffer holding the voxel coordinate of the current point.
    coor = np.zeros(shape=(3, ), dtype=np.int32)
    voxel_num = 0
    failed = False
    for i in range(N):
        failed = False
        for j in range(ndim):
            c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j])
            # Points outside the grid along any axis are skipped.
            if c < 0 or c >= grid_size[j]:
                failed = True
                break
            coor[j] = c
        if failed:
            continue
        voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]]
        if voxelidx == -1:
            # First point in this cell: open a new voxel for it.
            voxelidx = voxel_num
            # Budget exhausted: stop the whole scan, so every remaining
            # point is dropped (including ones in already-open voxels).
            if voxel_num >= max_voxels:
                break
            voxel_num += 1
            coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx
            coors[voxelidx] = coor
        num = num_points_per_voxel[voxelidx]
        if num < max_points:
            voxels[voxelidx, num] = points[i]
            num_points_per_voxel[voxelidx] += 1
    return voxel_num
def build_dataset(cfg, default_args=None):
    """Build a dataset, recursing through the supported wrapper types.

    Args:
        cfg (dict | list | tuple): Dataset config. A list/tuple of configs
            is built into a ``ConcatDataset``. For a single config the
            ``type`` entry selects one of the wrappers ``ConcatDataset``,
            ``RepeatDataset``, ``ClassBalancedDataset`` or ``CBGSDataset``;
            any other config is built from the ``DATASETS`` registry, or
            via ``_concat_dataset`` when its ``ann_file`` is a list/tuple.
        default_args (dict, optional): Default arguments forwarded to every
            nested build. Defaults to None.

    Returns:
        The built dataset object.
    """
    # Imported here, not at module level, as in the original code.
    from mmdet3d.datasets.dataset_wrappers import CBGSDataset
    from mmdet.datasets.dataset_wrappers import (ClassBalancedDataset,
                                                 ConcatDataset, RepeatDataset)

    def _build(sub_cfg):
        return build_dataset(sub_cfg, default_args)

    if isinstance(cfg, (list, tuple)):
        return ConcatDataset([_build(item) for item in cfg])

    wrapper = cfg['type']
    if wrapper == 'ConcatDataset':
        return ConcatDataset([_build(item) for item in cfg['datasets']],
                             cfg.get('separate_eval', True))
    if wrapper == 'RepeatDataset':
        return RepeatDataset(_build(cfg['dataset']), cfg['times'])
    if wrapper == 'ClassBalancedDataset':
        return ClassBalancedDataset(
            _build(cfg['dataset']), cfg['oversample_thr'])
    if wrapper == 'CBGSDataset':
        return CBGSDataset(_build(cfg['dataset']))

    # Plain dataset config from here on.
    if isinstance(cfg.get('ann_file'), (list, tuple)):
        return _concat_dataset(cfg, default_args)
    return build_from_cfg(cfg, DATASETS, default_args)
def get_data_info(self, index):
    """Assemble the pipeline input for one sample.

    Args:
        index (int): Index of the sample data to get.

    Returns:
        dict | None: Data information that will be passed to the data
            preprocessing pipelines, with the following keys:

            - pts_filename (str): ``data_root`` joined with the sample's
              ``pts_path``.
            - sample_idx: The sample's ``point_cloud.lidar_idx`` entry.
            - file_name (str): Same value as ``pts_filename``.
            - ann_info (dict): Annotation info, only outside test mode.

            None is returned outside test mode when ``filter_empty_gt`` is
            set and no entry of ``gt_labels_3d`` differs from -1.
    """
    record = self.data_infos[index]
    lidar_idx = record['point_cloud']['lidar_idx']
    cloud_path = osp.join(self.data_root, record['pts_path'])

    result = {
        'pts_filename': cloud_path,
        'sample_idx': lidar_idx,
        'file_name': cloud_path,
    }
    if self.test_mode:
        return result

    annotations = self.get_ann_info(index)
    result['ann_info'] = annotations
    if self.filter_empty_gt:
        has_valid_label = (annotations['gt_labels_3d'] != -1).any()
        if ~has_valid_label:
            return None
    return result
def prepare_train_data(self, index):
    """Run one sample through the training pipeline.

    Args:
        index (int): Index for accessing the target data.

    Returns:
        dict | None: Training data dict of the corresponding index. None
            when ``get_data_info`` returns None, or when ``filter_empty_gt``
            is set and the pipeline output is None or has no entry of
            ``gt_labels_3d`` different from -1.
    """
    raw = self.get_data_info(index)
    if raw is None:
        return None

    self.pre_pipeline(raw)
    sample = self.pipeline(raw)

    if not self.filter_empty_gt:
        return sample
    if sample is None:
        return None
    # Labels are read through ``._data`` of the pipeline output entry.
    if ~(sample['gt_labels_3d']._data != -1).any():
        return None
    return sample
def format_results(self,
                   outputs,
                   pklfile_prefix=None,
                   submission_prefix=None):
    """Format the results to pkl file.

    Args:
        outputs (list[dict]): Testing results of the dataset.
        pklfile_prefix (str | None): The prefix of pkl files. It includes
            the file path and the prefix of filename, e.g., "a/b/prefix".
            If not specified, a temp file will be created. Default: None.
        submission_prefix (str | None): Accepted for interface
            compatibility; not used by this implementation.
            Default: None.

    Returns:
        tuple: (outputs, tmp_dir), outputs is the detection results,
            tmp_dir is the ``tempfile.TemporaryDirectory`` created for
            saving the pkl file when ``pklfile_prefix`` is not specified,
            otherwise None.
    """
    # Must be bound on every path: it is returned even when the caller
    # supplies a prefix and no temporary directory is created.
    tmp_dir = None
    if pklfile_prefix is None:
        tmp_dir = tempfile.TemporaryDirectory()
        pklfile_prefix = osp.join(tmp_dir.name, 'results')
    out = f'{pklfile_prefix}.pkl'
    mmcv.dump(outputs, out)
    return outputs, tmp_dir
gt_annos = [info['annos'] for info in self.data_infos] label2cat = {i: cat_id for i, cat_id in enumerate(self.CLASSES)} ret_dict = indoor_eval( gt_annos, results, iou_thr, label2cat, logger=logger, box_type_3d=self.box_type_3d, box_mode_3d=self.box_mode_3d) if show: self.show(results, out_dir) return ret_dict def __len__(self): """Return the length of data infos. Returns: int: Length of data infos. """ return len(self.data_infos) def _rand_another(self, idx): """Randomly get another item with the same flag. Returns: int: Another index of item with the same flag. """ pool = np.where(self.flag == self.flag[idx])[0] return np.random.choice(pool) def __getitem__(self, idx): """Get item from infos according to the given index. Returns: dict: Data dictionary of the corresponding index. """ if self.test_mode: return self.prepare_test_data(idx) while True: data = self.prepare_train_data(idx) if data is None: idx = self._rand_another(idx) continue return data def _set_group_flag(self): """Set flag according to image aspect ratio. Images with aspect ratio greater than 1 will be set as group 1, otherwise group 0. In 3D datasets, they are all the same, thus are all zeros. """ self.flag = np.zeros(len(self), dtype=np.uint8) ================================================ FILE: mmdet3d/datasets/dataset_wrappers.py ================================================ import numpy as np from .builder import DATASETS @DATASETS.register_module() class CBGSDataset(object): """A wrapper of class sampled dataset with ann_file path. Implementation of paper `Class-balanced Grouping and Sampling for Point Cloud 3D Object Detection `_. Balance the number of scenes under different classes. Args: dataset (:obj:`CustomDataset`): The dataset to be class sampled. 
""" def __init__(self, dataset): self.dataset = dataset self.CLASSES = dataset.CLASSES self.cat2id = {name: i for i, name in enumerate(self.CLASSES)} self.sample_indices = self._get_sample_indices() # self.dataset.data_infos = self.data_infos if hasattr(self.dataset, 'flag'): self.flag = np.array( [self.dataset.flag[ind] for ind in self.sample_indices], dtype=np.uint8) def _get_sample_indices(self): """Load annotations from ann_file. Args: ann_file (str): Path of the annotation file. Returns: list[dict]: List of annotations after class sampling. """ class_sample_idxs = {cat_id: [] for cat_id in self.cat2id.values()} for idx in range(len(self.dataset)): sample_cat_ids = self.dataset.get_cat_ids(idx) for cat_id in sample_cat_ids: class_sample_idxs[cat_id].append(idx) duplicated_samples = sum( [len(v) for _, v in class_sample_idxs.items()]) class_distribution = { k: len(v) / duplicated_samples for k, v in class_sample_idxs.items() } sample_indices = [] frac = 1.0 / len(self.CLASSES) ratios = [frac / v for v in class_distribution.values()] for cls_inds, ratio in zip(list(class_sample_idxs.values()), ratios): sample_indices += np.random.choice(cls_inds, int(len(cls_inds) * ratio)).tolist() return sample_indices def __getitem__(self, idx): """Get item from infos according to the given index. Returns: dict: Data dictionary of the corresponding index. """ ori_idx = self.sample_indices[idx] return self.dataset[ori_idx] def __len__(self): """Return the length of data infos. Returns: int: Length of data infos. """ return len(self.sample_indices) ================================================ FILE: mmdet3d/datasets/kitti2d_dataset.py ================================================ import mmcv import numpy as np from mmdet.datasets import DATASETS, CustomDataset @DATASETS.register_module() class Kitti2DDataset(CustomDataset): r"""KITTI 2D Dataset. This class serves as the API for experiments on the `KITTI Dataset `_. Args: data_root (str): Path of dataset root. 
ann_file (str): Path of annotation file. pipeline (list[dict], optional): Pipeline used for data processing. Defaults to None. classes (tuple[str], optional): Classes used in the dataset. Defaults to None. modality (dict, optional): Modality to specify the sensor data used as input. Defaults to None. box_type_3d (str, optional): Type of 3D box of this dataset. Based on the `box_type_3d`, the dataset will encapsulate the box to its original format then converted them to `box_type_3d`. Defaults to 'LiDAR'. Available options includes - 'LiDAR': Box in LiDAR coordinates. - 'Depth': Box in depth coordinates, usually for indoor dataset. - 'Camera': Box in camera coordinates. filter_empty_gt (bool, optional): Whether to filter empty GT. Defaults to True. test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. """ CLASSES = ('car', 'pedestrian', 'cyclist') """ Annotation format: [ { 'image': { 'image_idx': 0, 'image_path': 'training/image_2/000000.png', 'image_shape': array([ 370, 1224], dtype=int32) }, 'point_cloud': { 'num_features': 4, 'velodyne_path': 'training/velodyne/000000.bin' }, 'calib': { 'P0': (4, 4), 'P1': (4, 4), 'P2': (4, 4), 'P3': (4, 4), 'R0_rect':4x4 np.array, 'Tr_velo_to_cam': 4x4 np.array, 'Tr_imu_to_velo': 4x4 np.array }, 'annos': { 'name': (n), 'truncated': (n), 'occluded': (n), 'alpha': (n), 'bbox': (n, 4), 'dimensions': (n, 3), 'location': (n, 3), 'rotation_y': (n), 'score': (n), 'index': array([0], dtype=int32), 'group_ids': array([0], dtype=int32), 'difficulty': array([0], dtype=int32), 'num_points_in_gt': (n), } } ] """ def load_annotations(self, ann_file): """Load annotations from ann_file. Args: ann_file (str): Path of the annotation file. Returns: list[dict]: List of annotations. 
def get_ann_info(self, index):
    """Collect the 2D annotations of one sample.

    Only ground truths whose name is in ``self.CLASSES`` are kept.

    Args:
        index (int): Index of the annotation data to get.

    Returns:
        dict: Annotation information consists of the following keys:

            - bboxes (np.ndarray): Ground truth bboxes as float32.
            - labels (np.ndarray): Labels of ground truths, looked up in
              ``self.cat2label``.
    """
    # Indexed access (rather than iteration state) keeps this usable from
    # the eval hook as well.
    annos = self.data_infos[index]['annos']
    names, boxes, difficulty = (annos['name'], annos['bbox'],
                                annos['difficulty'])

    # Drop ground truths of classes that are not of interest.
    keep = self.keep_arrays_by_name(names, self.CLASSES)
    boxes = boxes[keep]
    names = names[keep]
    # Filtered like the other fields, although it is not returned.
    _ = difficulty[keep]

    labels = np.array([self.cat2label[name] for name in names])
    return {'bboxes': boxes.astype(np.float32), 'labels': labels}
def keep_arrays_by_name(self, gt_names, used_classes):
    """Find the positions of ground truths that belong to wanted classes.

    Args:
        gt_names (list[str]): Names of ground truths.
        used_classes (list[str]): Classes of interest.

    Returns:
        np.ndarray: int64 indices of the ground truths whose name is in
            ``used_classes``, in their original order.
    """
    kept = []
    for position, name in enumerate(gt_names):
        if name in used_classes:
            kept.append(position)
    return np.array(kept, dtype=np.int64)
Returns: tuple (str, dict): Average precision results in str format and average precision results in dict format. """ from mmdet3d.core.evaluation import kitti_eval eval_types = ['bbox'] if not eval_types else eval_types assert eval_types in ('bbox', ['bbox' ]), 'KITTI data set only evaluate bbox' gt_annos = [info['annos'] for info in self.data_infos] ap_result_str, ap_dict = kitti_eval( gt_annos, result_files, self.CLASSES, eval_types=['bbox']) return ap_result_str, ap_dict ================================================ FILE: mmdet3d/datasets/kitti_dataset.py ================================================ import copy import mmcv import numpy as np import os import tempfile import torch from mmcv.utils import print_log from os import path as osp from mmdet.datasets import DATASETS from ..core import show_result from ..core.bbox import (Box3DMode, CameraInstance3DBoxes, Coord3DMode, points_cam2img) from .custom_3d import Custom3DDataset @DATASETS.register_module() class KittiDataset(Custom3DDataset): r"""KITTI Dataset. This class serves as the API for experiments on the `KITTI Dataset `_. Args: data_root (str): Path of dataset root. ann_file (str): Path of annotation file. split (str): Split of input data. pts_prefix (str, optional): Prefix of points files. Defaults to 'velodyne'. pipeline (list[dict], optional): Pipeline used for data processing. Defaults to None. classes (tuple[str], optional): Classes used in the dataset. Defaults to None. modality (dict, optional): Modality to specify the sensor data used as input. Defaults to None. box_type_3d (str, optional): Type of 3D box of this dataset. Based on the `box_type_3d`, the dataset will encapsulate the box to its original format then converted them to `box_type_3d`. Defaults to 'LiDAR' in this dataset. Available options includes - 'LiDAR': Box in LiDAR coordinates. - 'Depth': Box in depth coordinates, usually for indoor dataset. - 'Camera': Box in camera coordinates. 
filter_empty_gt (bool, optional): Whether to filter empty GT. Defaults to True. test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. pcd_limit_range (list): The range of point cloud used to filter invalid predicted boxes. Default: [0, -40, -3, 70.4, 40, 0.0]. """ CLASSES = ('car', 'pedestrian', 'cyclist') def __init__(self, data_root, ann_file, split, pts_prefix='velodyne', pipeline=None, classes=None, modality=None, box_type_3d='LiDAR', filter_empty_gt=True, test_mode=False, pcd_limit_range=[0, -40, -3, 70.4, 40, 0.0]): super().__init__( data_root=data_root, ann_file=ann_file, pipeline=pipeline, classes=classes, modality=modality, box_type_3d=box_type_3d, filter_empty_gt=filter_empty_gt, test_mode=test_mode) self.split = split self.root_split = os.path.join(self.data_root, split) assert self.modality is not None self.pcd_limit_range = pcd_limit_range self.pts_prefix = pts_prefix def _get_pts_filename(self, idx): """Get point cloud filename according to the given index. Args: index (int): Index of the point cloud file to get. Returns: str: Name of the point cloud file. """ pts_filename = osp.join(self.root_split, self.pts_prefix, f'{idx:06d}.bin') return pts_filename def get_data_info(self, index): """Get data info according to the given index. Args: index (int): Index of the sample data to get. Returns: dict: Data information that will be passed to the data \ preprocessing pipelines. It includes the following keys: - sample_idx (str): Sample index. - pts_filename (str): Filename of point clouds. - img_prefix (str | None): Prefix of image files. - img_info (dict): Image info. - lidar2img (list[np.ndarray], optional): Transformations \ from lidar to different cameras. - ann_info (dict): Annotation info. 
""" info = self.data_infos[index] sample_idx = info['image']['image_idx'] img_filename = os.path.join(self.data_root, info['image']['image_path']) # TODO: consider use torch.Tensor only rect = info['calib']['R0_rect'].astype(np.float32) Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) P2 = info['calib']['P2'].astype(np.float32) lidar2img = P2 @ rect @ Trv2c pts_filename = self._get_pts_filename(sample_idx) input_dict = dict( sample_idx=sample_idx, pts_filename=pts_filename, img_prefix=None, img_info=dict(filename=img_filename), lidar2img=lidar2img) if not self.test_mode: annos = self.get_ann_info(index) input_dict['ann_info'] = annos return input_dict def get_ann_info(self, index): """Get annotation info according to the given index. Args: index (int): Index of the annotation data to get. Returns: dict: annotation information consists of the following keys: - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): \ 3D ground truth bboxes. - gt_labels_3d (np.ndarray): Labels of ground truths. - gt_bboxes (np.ndarray): 2D ground truth bboxes. - gt_labels (np.ndarray): Labels of ground truths. - gt_names (list[str]): Class names of ground truths. 
""" # Use index to get the annos, thus the evalhook could also use this api info = self.data_infos[index] rect = info['calib']['R0_rect'].astype(np.float32) Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) annos = info['annos'] # we need other objects to avoid collision when sample annos = self.remove_dontcare(annos) loc = annos['location'] dims = annos['dimensions'] rots = annos['rotation_y'] gt_names = annos['name'] gt_bboxes_3d = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1).astype(np.float32) # convert gt_bboxes_3d to velodyne coordinates gt_bboxes_3d = CameraInstance3DBoxes(gt_bboxes_3d).convert_to( self.box_mode_3d, np.linalg.inv(rect @ Trv2c)) gt_bboxes = annos['bbox'] selected = self.drop_arrays_by_name(gt_names, ['DontCare']) gt_bboxes = gt_bboxes[selected].astype('float32') gt_names = gt_names[selected] gt_labels = [] for cat in gt_names: if cat in self.CLASSES: gt_labels.append(self.CLASSES.index(cat)) else: gt_labels.append(-1) gt_labels = np.array(gt_labels).astype(np.int64) gt_labels_3d = copy.deepcopy(gt_labels) anns_results = dict( gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d, bboxes=gt_bboxes, labels=gt_labels, gt_names=gt_names) return anns_results def drop_arrays_by_name(self, gt_names, used_classes): """Drop irrelevant ground truths by name. Args: gt_names (list[str]): Names of ground truths. used_classes (list[str]): Classes of interest. Returns: np.ndarray: Indices of ground truths that will be dropped. """ inds = [i for i, x in enumerate(gt_names) if x not in used_classes] inds = np.array(inds, dtype=np.int64) return inds def keep_arrays_by_name(self, gt_names, used_classes): """Keep useful ground truths by name. Args: gt_names (list[str]): Names of ground truths. used_classes (list[str]): Classes of interest. Returns: np.ndarray: Indices of ground truths that will be keeped. 
""" inds = [i for i, x in enumerate(gt_names) if x in used_classes] inds = np.array(inds, dtype=np.int64) return inds def remove_dontcare(self, ann_info): """Remove annotations that do not need to be cared. Args: ann_info (dict): Dict of annotation infos. The ``'DontCare'`` annotations will be removed according to ann_file['name']. Returns: dict: Annotations after filtering. """ img_filtered_annotations = {} relevant_annotation_indices = [ i for i, x in enumerate(ann_info['name']) if x != 'DontCare' ] for key in ann_info.keys(): img_filtered_annotations[key] = ( ann_info[key][relevant_annotation_indices]) return img_filtered_annotations def format_results(self, outputs, pklfile_prefix=None, submission_prefix=None): """Format the results to pkl file. Args: outputs (list[dict]): Testing results of the dataset. pklfile_prefix (str | None): The prefix of pkl files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. submission_prefix (str | None): The prefix of submitted files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. Returns: tuple: (result_files, tmp_dir), result_files is a dict containing \ the json filepaths, tmp_dir is the temporal directory created \ for saving json files when jsonfile_prefix is not specified. 
""" if pklfile_prefix is None: tmp_dir = tempfile.TemporaryDirectory() pklfile_prefix = osp.join(tmp_dir.name, 'results') else: tmp_dir = None if not isinstance(outputs[0], dict): result_files = self.bbox2result_kitti2d(outputs, self.CLASSES, pklfile_prefix, submission_prefix) elif 'pts_bbox' in outputs[0] or 'img_bbox' in outputs[0]: result_files = dict() for name in outputs[0]: results_ = [out[name] for out in outputs] pklfile_prefix_ = pklfile_prefix + name if submission_prefix is not None: submission_prefix_ = submission_prefix + name else: submission_prefix_ = None if 'img' in name: result_files = self.bbox2result_kitti2d( results_, self.CLASSES, pklfile_prefix_, submission_prefix_) else: result_files_ = self.bbox2result_kitti( results_, self.CLASSES, pklfile_prefix_, submission_prefix_) result_files[name] = result_files_ else: result_files = self.bbox2result_kitti(outputs, self.CLASSES, pklfile_prefix, submission_prefix) return result_files, tmp_dir def evaluate(self, results, metric=None, logger=None, pklfile_prefix=None, submission_prefix=None, show=False, out_dir=None): """Evaluation in KITTI protocol. Args: results (list[dict]): Testing results of the dataset. metric (str | list[str]): Metrics to be evaluated. logger (logging.Logger | str | None): Logger used for printing related information during evaluation. Default: None. pklfile_prefix (str | None): The prefix of pkl files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. submission_prefix (str | None): The prefix of submission datas. If not specified, the submission data will not be generated. show (bool): Whether to visualize. Default: False. out_dir (str): Path to save the visualization results. Default: None. Returns: dict[str, float]: Results of each evaluation metric. 
""" result_files, tmp_dir = self.format_results(results, pklfile_prefix) from mmdet3d.core.evaluation import kitti_eval gt_annos = [info['annos'] for info in self.data_infos] if isinstance(result_files, dict): ap_dict = dict() for name, result_files_ in result_files.items(): eval_types = ['bbox', 'bev', '3d'] if 'img' in name: eval_types = ['bbox'] ap_result_str, ap_dict_ = kitti_eval( gt_annos, result_files_, self.CLASSES, eval_types=eval_types) for ap_type, ap in ap_dict_.items(): ap_dict[f'{name}/{ap_type}'] = float('{:.4f}'.format(ap)) print_log( f'Results of {name}:\n' + ap_result_str, logger=logger) else: if metric == 'img_bbox': ap_result_str, ap_dict = kitti_eval( gt_annos, result_files, self.CLASSES, eval_types=['bbox']) else: ap_result_str, ap_dict = kitti_eval(gt_annos, result_files, self.CLASSES) print_log('\n' + ap_result_str, logger=logger) if tmp_dir is not None: tmp_dir.cleanup() if show: self.show(results, out_dir) return ap_dict def bbox2result_kitti(self, net_outputs, class_names, pklfile_prefix=None, submission_prefix=None): """Convert 3D detection results to kitti format for evaluation and test submission. Args: net_outputs (list[np.ndarray]): List of array storing the \ inferenced bounding boxes and scores. class_names (list[String]): A list of class names. pklfile_prefix (str | None): The prefix of pkl file. submission_prefix (str | None): The prefix of submission file. Returns: list[dict]: A list of dictionaries with the kitti format. 
""" assert len(net_outputs) == len(self.data_infos), \ 'invalid list length of network outputs' if submission_prefix is not None: mmcv.mkdir_or_exist(submission_prefix) det_annos = [] print('\nConverting prediction to KITTI format') for idx, pred_dicts in enumerate( mmcv.track_iter_progress(net_outputs)): annos = [] info = self.data_infos[idx] sample_idx = info['image']['image_idx'] image_shape = info['image']['image_shape'][:2] box_dict = self.convert_valid_bboxes(pred_dicts, info) anno = { 'name': [], 'truncated': [], 'occluded': [], 'alpha': [], 'bbox': [], 'dimensions': [], 'location': [], 'rotation_y': [], 'score': [] } if len(box_dict['bbox']) > 0: box_2d_preds = box_dict['bbox'] box_preds = box_dict['box3d_camera'] scores = box_dict['scores'] box_preds_lidar = box_dict['box3d_lidar'] label_preds = box_dict['label_preds'] for box, box_lidar, bbox, score, label in zip( box_preds, box_preds_lidar, box_2d_preds, scores, label_preds): bbox[2:] = np.minimum(bbox[2:], image_shape[::-1]) bbox[:2] = np.maximum(bbox[:2], [0, 0]) anno['name'].append(class_names[int(label)]) anno['truncated'].append(0.0) anno['occluded'].append(0) anno['alpha'].append( -np.arctan2(-box_lidar[1], box_lidar[0]) + box[6]) anno['bbox'].append(bbox) anno['dimensions'].append(box[3:6]) anno['location'].append(box[:3]) anno['rotation_y'].append(box[6]) anno['score'].append(score) anno = {k: np.stack(v) for k, v in anno.items()} annos.append(anno) else: anno = { 'name': np.array([]), 'truncated': np.array([]), 'occluded': np.array([]), 'alpha': np.array([]), 'bbox': np.zeros([0, 4]), 'dimensions': np.zeros([0, 3]), 'location': np.zeros([0, 3]), 'rotation_y': np.array([]), 'score': np.array([]), } annos.append(anno) if submission_prefix is not None: curr_file = f'{submission_prefix}/{sample_idx:06d}.txt' with open(curr_file, 'w') as f: bbox = anno['bbox'] loc = anno['location'] dims = anno['dimensions'] # lhw -> hwl for idx in range(len(bbox)): print( '{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} ' 
'{:.4f} {:.4f} {:.4f} ' '{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'.format( anno['name'][idx], anno['alpha'][idx], bbox[idx][0], bbox[idx][1], bbox[idx][2], bbox[idx][3], dims[idx][1], dims[idx][2], dims[idx][0], loc[idx][0], loc[idx][1], loc[idx][2], anno['rotation_y'][idx], anno['score'][idx]), file=f) annos[-1]['sample_idx'] = np.array( [sample_idx] * len(annos[-1]['score']), dtype=np.int64) det_annos += annos if pklfile_prefix is not None: if not pklfile_prefix.endswith(('.pkl', '.pickle')): out = f'{pklfile_prefix}.pkl' mmcv.dump(det_annos, out) print(f'Result is saved to {out}.') return det_annos def bbox2result_kitti2d(self, net_outputs, class_names, pklfile_prefix=None, submission_prefix=None): """Convert 2D detection results to kitti format for evaluation and test submission. Args: net_outputs (list[np.ndarray]): List of array storing the \ inferenced bounding boxes and scores. class_names (list[String]): A list of class names. pklfile_prefix (str | None): The prefix of pkl file. submission_prefix (str | None): The prefix of submission file. 
Returns: list[dict]: A list of dictionaries have the kitti format """ assert len(net_outputs) == len(self.data_infos), \ 'invalid list length of network outputs' det_annos = [] print('\nConverting prediction to KITTI format') for i, bboxes_per_sample in enumerate( mmcv.track_iter_progress(net_outputs)): annos = [] anno = dict( name=[], truncated=[], occluded=[], alpha=[], bbox=[], dimensions=[], location=[], rotation_y=[], score=[]) sample_idx = self.data_infos[i]['image']['image_idx'] num_example = 0 for label in range(len(bboxes_per_sample)): bbox = bboxes_per_sample[label] for i in range(bbox.shape[0]): anno['name'].append(class_names[int(label)]) anno['truncated'].append(0.0) anno['occluded'].append(0) anno['alpha'].append(0.0) anno['bbox'].append(bbox[i, :4]) # set dimensions (height, width, length) to zero anno['dimensions'].append( np.zeros(shape=[3], dtype=np.float32)) # set the 3D translation to (-1000, -1000, -1000) anno['location'].append( np.ones(shape=[3], dtype=np.float32) * (-1000.0)) anno['rotation_y'].append(0.0) anno['score'].append(bbox[i, 4]) num_example += 1 if num_example == 0: annos.append( dict( name=np.array([]), truncated=np.array([]), occluded=np.array([]), alpha=np.array([]), bbox=np.zeros([0, 4]), dimensions=np.zeros([0, 3]), location=np.zeros([0, 3]), rotation_y=np.array([]), score=np.array([]), )) else: anno = {k: np.stack(v) for k, v in anno.items()} annos.append(anno) annos[-1]['sample_idx'] = np.array( [sample_idx] * num_example, dtype=np.int64) det_annos += annos if pklfile_prefix is not None: # save file in pkl format pklfile_path = ( pklfile_prefix[:-4] if pklfile_prefix.endswith( ('.pkl', '.pickle')) else pklfile_prefix) mmcv.dump(det_annos, pklfile_path) if submission_prefix is not None: # save file in submission format mmcv.mkdir_or_exist(submission_prefix) print(f'Saving KITTI submission to {submission_prefix}') for i, anno in enumerate(det_annos): sample_idx = self.data_infos[i]['image']['image_idx'] cur_det_file = 
def convert_valid_bboxes(self, box_dict, info):
    """Convert the predicted boxes into valid ones.

    A box is kept when its projected 2D box overlaps the image and its
    center lies strictly inside ``self.pcd_limit_range``.

    Args:
        box_dict (dict): Box dictionaries to be converted.

            - boxes_3d (:obj:`LiDARInstance3DBoxes`): 3D bounding boxes.
            - scores_3d (torch.Tensor): Scores of boxes.
            - labels_3d (torch.Tensor): Class labels of boxes.
        info (dict): Data info; ``info['image']`` and ``info['calib']``
            are read.

    Returns:
        dict: Valid predicted boxes.

            - bbox (np.ndarray): 2D bounding boxes.
            - box3d_camera (np.ndarray): 3D bounding boxes in \
                camera coordinate.
            - box3d_lidar (np.ndarray): 3D bounding boxes in \
                LiDAR coordinate.
            - scores (np.ndarray): Scores of boxes.
            - label_preds (np.ndarray): Class label predictions.
            - sample_idx: ``info['image']['image_idx']``.
    """
    # TODO: refactor this function
    box_preds = box_dict['boxes_3d']
    scores = box_dict['scores_3d']
    labels = box_dict['labels_3d']
    sample_idx = info['image']['image_idx']

    def _no_valid_boxes():
        # One definition for both "nothing predicted" and "nothing valid".
        # ``label_preds`` is 1-D like ``scores`` (one label per box).
        return dict(
            bbox=np.zeros([0, 4]),
            box3d_camera=np.zeros([0, 7]),
            box3d_lidar=np.zeros([0, 7]),
            scores=np.zeros([0]),
            label_preds=np.zeros([0]),
            sample_idx=sample_idx)

    # TODO: remove the hack of yaw
    # NOTE(review): this shifts the yaw of the caller's boxes in place, so
    # converting the same outputs twice shifts them twice - confirm that
    # callers never reuse ``box_dict`` afterwards.
    box_preds.tensor[:, -1] = box_preds.tensor[:, -1] - np.pi
    box_preds.limit_yaw(offset=0.5, period=np.pi * 2)

    if len(box_preds) == 0:
        return _no_valid_boxes()

    rect = info['calib']['R0_rect'].astype(np.float32)
    Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32)
    P2 = info['calib']['P2'].astype(np.float32)
    img_shape = info['image']['image_shape']
    P2 = box_preds.tensor.new_tensor(P2)

    box_preds_camera = box_preds.convert_to(Box3DMode.CAM, rect @ Trv2c)

    box_corners = box_preds_camera.corners
    box_corners_in_image = points_cam2img(box_corners, P2)
    # box_corners_in_image: [N, 8, 2]
    minxy = torch.min(box_corners_in_image, dim=1)[0]
    maxxy = torch.max(box_corners_in_image, dim=1)[0]
    box_2d_preds = torch.cat([minxy, maxxy], dim=1)
    # Post-processing
    # check box_preds_camera: the projected box must overlap the image
    image_shape = box_preds.tensor.new_tensor(img_shape)
    valid_cam_inds = ((box_2d_preds[:, 0] < image_shape[1]) &
                      (box_2d_preds[:, 1] < image_shape[0]) &
                      (box_2d_preds[:, 2] > 0) & (box_2d_preds[:, 3] > 0))
    # check box_preds: the center must lie inside the point cloud range
    limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range)
    valid_pcd_inds = ((box_preds.center > limit_range[:3]) &
                      (box_preds.center < limit_range[3:]))
    valid_inds = valid_cam_inds & valid_pcd_inds.all(-1)

    if valid_inds.sum() > 0:
        return dict(
            bbox=box_2d_preds[valid_inds, :].numpy(),
            box3d_camera=box_preds_camera[valid_inds].tensor.numpy(),
            box3d_lidar=box_preds[valid_inds].tensor.numpy(),
            scores=scores[valid_inds].numpy(),
            label_preds=labels[valid_inds].numpy(),
            sample_idx=sample_idx,
        )
    return _no_valid_boxes()
data_root (str): Path of dataset root. classes (tuple[str], optional): Classes used in the dataset. Defaults to None. load_interval (int, optional): Interval of loading the dataset. It is used to uniformly sample the dataset. Defaults to 1. modality (dict, optional): Modality to specify the sensor data used as input. Defaults to None. box_type_3d (str, optional): Type of 3D box of this dataset. Based on the `box_type_3d`, the dataset will encapsulate the box to its original format then converted them to `box_type_3d`. Defaults to 'LiDAR' in this dataset. Available options includes - 'LiDAR': Box in LiDAR coordinates. - 'Depth': Box in depth coordinates, usually for indoor dataset. - 'Camera': Box in camera coordinates. filter_empty_gt (bool, optional): Whether to filter empty GT. Defaults to True. test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. """ NameMapping = { 'bicycle': 'bicycle', 'bus': 'bus', 'car': 'car', 'emergency_vehicle': 'emergency_vehicle', 'motorcycle': 'motorcycle', 'other_vehicle': 'other_vehicle', 'pedestrian': 'pedestrian', 'truck': 'truck', 'animal': 'animal' } DefaultAttribute = { 'car': 'is_stationary', 'truck': 'is_stationary', 'bus': 'is_stationary', 'emergency_vehicle': 'is_stationary', 'other_vehicle': 'is_stationary', 'motorcycle': 'is_stationary', 'bicycle': 'is_stationary', 'pedestrian': 'is_stationary', 'animal': 'is_stationary' } CLASSES = ('car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', 'bicycle', 'pedestrian', 'animal') def __init__(self, ann_file, pipeline=None, data_root=None, classes=None, load_interval=1, modality=None, box_type_3d='LiDAR', filter_empty_gt=True, test_mode=False): self.load_interval = load_interval super().__init__( data_root=data_root, ann_file=ann_file, pipeline=pipeline, classes=classes, modality=modality, box_type_3d=box_type_3d, filter_empty_gt=filter_empty_gt, test_mode=test_mode) if self.modality is None: self.modality = dict( use_camera=False, 
def get_data_info(self, index):
    """Collect the pipeline inputs of one sample.

    Args:
        index (int): Index of the sample data to get.

    Returns:
        dict: Data information that will be passed to the data \
            preprocessing pipelines. It includes the following keys:

            - sample_idx (str): sample token
            - pts_filename (str): filename of point clouds
            - sweeps (list[dict]): infos of sweeps
            - timestamp (float): stored timestamp divided by 1e6
            - img_filename (list[str], optional): image filenames, only \
                when ``self.modality['use_camera']`` is set
            - lidar2img (list[np.ndarray], optional): one 4x4 lidar to \
                image transformation per camera
            - ann_info (dict): annotation info, only when not in test mode
    """
    record = self.data_infos[index]
    # standard protocol modified from SECOND.Pytorch
    input_dict = dict(
        sample_idx=record['token'],
        pts_filename=record['lidar_path'],
        sweeps=record['sweeps'],
        timestamp=record['timestamp'] / 1e6,
    )

    if self.modality['use_camera']:
        filenames = []
        projections = []
        for camera in record['cams'].values():
            filenames.append(camera['data_path'])
            # lidar -> camera extrinsics, assembled in transposed layout:
            # rotation in the upper-left block, translation in the last
            # row.
            rot_inv = np.linalg.inv(camera['sensor2lidar_rotation'])
            trans = camera['sensor2lidar_translation'] @ rot_inv.T
            extrinsic_t = np.eye(4)
            extrinsic_t[:3, :3] = rot_inv.T
            extrinsic_t[3, :3] = -trans
            # Embed the intrinsics in a 4x4 identity so that both
            # matrices can be multiplied.
            intrinsic = camera['cam_intrinsic']
            rows, cols = intrinsic.shape[0], intrinsic.shape[1]
            padded = np.eye(4)
            padded[:rows, :cols] = intrinsic
            projections.append(padded @ extrinsic_t.T)

        input_dict.update(
            dict(
                img_filename=filenames,
                lidar2img=projections,
            ))

    if not self.test_mode:
        input_dict['ann_info'] = self.get_ann_info(index)

    return input_dict
""" lyft_annos = {} mapped_class_names = self.CLASSES print('Start to convert detection format...') for sample_id, det in enumerate(mmcv.track_iter_progress(results)): annos = [] boxes = output_to_lyft_box(det) sample_token = self.data_infos[sample_id]['token'] boxes = lidar_lyft_box_to_global(self.data_infos[sample_id], boxes) for i, box in enumerate(boxes): name = mapped_class_names[box.label] lyft_anno = dict( sample_token=sample_token, translation=box.center.tolist(), size=box.wlh.tolist(), rotation=box.orientation.elements.tolist(), name=name, score=box.score) annos.append(lyft_anno) lyft_annos[sample_token] = annos lyft_submissions = { 'meta': self.modality, 'results': lyft_annos, } mmcv.mkdir_or_exist(jsonfile_prefix) res_path = osp.join(jsonfile_prefix, 'results_lyft.json') print('Results writes to', res_path) mmcv.dump(lyft_submissions, res_path) return res_path def _evaluate_single(self, result_path, logger=None, metric='bbox', result_name='pts_bbox'): """Evaluation for a single model in Lyft protocol. Args: result_path (str): Path of the result file. logger (logging.Logger | str | None): Logger used for printing related information during evaluation. Default: None. metric (str): Metric name used for evaluation. Default: 'bbox'. result_name (str): Result name in the metric prefix. Default: 'pts_bbox'. Returns: dict: Dictionary of evaluation details. 
""" output_dir = osp.join(*osp.split(result_path)[:-1]) lyft = Lyft( data_path=osp.join(self.data_root, self.version), json_path=osp.join(self.data_root, self.version, self.version), verbose=True) eval_set_map = { 'v1.01-train': 'val', } metrics = lyft_eval(lyft, self.data_root, result_path, eval_set_map[self.version], output_dir, logger) # record metrics detail = dict() metric_prefix = f'{result_name}_Lyft' for i, name in enumerate(metrics['class_names']): AP = float(metrics['mAPs_cate'][i]) detail[f'{metric_prefix}/{name}_AP'] = AP detail[f'{metric_prefix}/mAP'] = metrics['Final mAP'] return detail def format_results(self, results, jsonfile_prefix=None, csv_savepath=None): """Format the results to json (standard format for COCO evaluation). Args: results (list[dict]): Testing results of the dataset. jsonfile_prefix (str | None): The prefix of json files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. csv_savepath (str | None): The path for saving csv files. It includes the file path and the csv filename, e.g., "a/b/filename.csv". If not specified, the result will not be converted to csv file. Returns: tuple: Returns (result_files, tmp_dir), where `result_files` is a \ dict containing the json filepaths, `tmp_dir` is the temporal \ directory created for saving json files when \ `jsonfile_prefix` is not specified. """ assert isinstance(results, list), 'results must be a list' assert len(results) == len(self), ( 'The length of results is not equal to the dataset len: {} != {}'. 
format(len(results), len(self))) if jsonfile_prefix is None: tmp_dir = tempfile.TemporaryDirectory() jsonfile_prefix = osp.join(tmp_dir.name, 'results') else: tmp_dir = None if not isinstance(results[0], dict): result_files = self._format_bbox(results, jsonfile_prefix) else: result_files = dict() for name in results[0]: print(f'\nFormating bboxes of {name}') results_ = [out[name] for out in results] tmp_file_ = osp.join(jsonfile_prefix, name) result_files.update( {name: self._format_bbox(results_, tmp_file_)}) if csv_savepath is not None: self.json2csv(result_files['pts_bbox'], csv_savepath) return result_files, tmp_dir def evaluate(self, results, metric='bbox', logger=None, jsonfile_prefix=None, csv_savepath=None, result_names=['pts_bbox'], show=False, out_dir=None): """Evaluation in Lyft protocol. Args: results (list[dict]): Testing results of the dataset. metric (str | list[str]): Metrics to be evaluated. logger (logging.Logger | str | None): Logger used for printing related information during evaluation. Default: None. jsonfile_prefix (str | None): The prefix of json files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. csv_savepath (str | None): The path for saving csv files. It includes the file path and the csv filename, e.g., "a/b/filename.csv". If not specified, the result will not be converted to csv file. show (bool): Whether to visualize. Default: False. out_dir (str): Path to save the visualization results. Default: None. Returns: dict[str, float]: Evaluation results. 
""" result_files, tmp_dir = self.format_results(results, jsonfile_prefix, csv_savepath) if isinstance(result_files, dict): results_dict = dict() for name in result_names: print(f'Evaluating bboxes of {name}') ret_dict = self._evaluate_single(result_files[name]) results_dict.update(ret_dict) elif isinstance(result_files, str): results_dict = self._evaluate_single(result_files) if tmp_dir is not None: tmp_dir.cleanup() if show: self.show(results, out_dir) return results_dict def show(self, results, out_dir): """Results visualization. Args: results (list[dict]): List of bounding boxes results. out_dir (str): Output directory of visualization result. """ for i, result in enumerate(results): example = self.prepare_test_data(i) points = example['points'][0]._data.numpy() data_info = self.data_infos[i] pts_path = data_info['lidar_path'] file_name = osp.split(pts_path)[-1].split('.')[0] # for now we convert points into depth mode points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, Coord3DMode.DEPTH) inds = result['pts_bbox']['scores_3d'] > 0.1 gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR, Box3DMode.DEPTH) pred_bboxes = result['pts_bbox']['boxes_3d'][inds].tensor.numpy() pred_bboxes = Box3DMode.convert(pred_bboxes, Box3DMode.LIDAR, Box3DMode.DEPTH) show_result(points, gt_bboxes, pred_bboxes, out_dir, file_name) def json2csv(self, json_path, csv_savepath): """Convert the json file to csv format for submission. Args: json_path (str): Path of the result json file. csv_savepath (str): Path to save the csv file. 
""" results = mmcv.load(json_path)['results'] sample_list_path = osp.join(self.data_root, 'sample_submission.csv') data = pd.read_csv(sample_list_path) Id_list = list(data['Id']) pred_list = list(data['PredictionString']) cnt = 0 print('Converting the json to csv...') for token in results.keys(): cnt += 1 predictions = results[token] prediction_str = '' for i in range(len(predictions)): prediction_str += \ str(predictions[i]['score']) + ' ' + \ str(predictions[i]['translation'][0]) + ' ' + \ str(predictions[i]['translation'][1]) + ' ' + \ str(predictions[i]['translation'][2]) + ' ' + \ str(predictions[i]['size'][0]) + ' ' + \ str(predictions[i]['size'][1]) + ' ' + \ str(predictions[i]['size'][2]) + ' ' + \ str(Quaternion(list(predictions[i]['rotation'])) .yaw_pitch_roll[0]) + ' ' + \ predictions[i]['name'] + ' ' prediction_str = prediction_str[:-1] idx = Id_list.index(token) pred_list[idx] = prediction_str df = pd.DataFrame({'Id': Id_list, 'PredictionString': pred_list}) df.to_csv(csv_savepath, index=False) def output_to_lyft_box(detection): """Convert the output to the box class in the Lyft. Args: detection (dict): Detection results. Returns: list[:obj:`LyftBox`]: List of standard LyftBoxes. """ box3d = detection['boxes_3d'] scores = detection['scores_3d'].numpy() labels = detection['labels_3d'].numpy() box_gravity_center = box3d.gravity_center.numpy() box_dims = box3d.dims.numpy() box_yaw = box3d.yaw.numpy() # TODO: check whether this is necessary # with dir_offset & dir_limit in the head box_yaw = -box_yaw - np.pi / 2 box_list = [] for i in range(len(box3d)): quat = Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) box = LyftBox( box_gravity_center[i], box_dims[i], quat, label=labels[i], score=scores[i]) box_list.append(box) return box_list def lidar_lyft_box_to_global(info, boxes): """Convert the box from ego to global coordinate. Args: info (dict): Info for a specific sample data, including the calibration information. 
boxes (list[:obj:`LyftBox`]): List of predicted LyftBoxes. The boxes
            are rotated and translated in place.

    Returns:
        list: List of standard LyftBoxes in the global coordinate.
    """
    box_list = []
    for box in boxes:
        # Move box to ego vehicle coord system
        box.rotate(Quaternion(info['lidar2ego_rotation']))
        box.translate(np.array(info['lidar2ego_translation']))
        # Move box to global coord system
        box.rotate(Quaternion(info['ego2global_rotation']))
        box.translate(np.array(info['ego2global_translation']))
        box_list.append(box)
    return box_list


================================================
FILE: mmdet3d/datasets/nuscenes_dataset.py
================================================
import mmcv
import numpy as np
import pyquaternion
import tempfile
from nuscenes.utils.data_classes import Box as NuScenesBox
from os import path as osp

from mmdet.datasets import DATASETS
from ..core import show_result
from ..core.bbox import Box3DMode, Coord3DMode, LiDARInstance3DBoxes
from .custom_3d import Custom3DDataset


@DATASETS.register_module()
class NuScenesDataset(Custom3DDataset):
    r"""NuScenes Dataset.

    This class serves as the API for experiments on the NuScenes Dataset.

    Please refer to `NuScenes Dataset <https://www.nuscenes.org/download>`_
    for data downloading.

    Args:
        ann_file (str): Path of annotation file.
        num_views (int, optional): Stored as ``self.num_views``; must not
            exceed 6. Defaults to 6.
        pipeline (list[dict], optional): Pipeline used for data processing.
            Defaults to None.
        data_root (str): Path of dataset root.
        classes (tuple[str], optional): Classes used in the dataset.
            Defaults to None.
        load_interval (int, optional): Interval of loading the dataset. It is
            used to uniformly sample the dataset. Defaults to 1.
        with_velocity (bool, optional): Whether include velocity prediction
            into the experiments. Defaults to True.
        modality (dict, optional): Modality to specify the sensor data used
            as input. Defaults to None, in which case a LiDAR-only modality
            dict is used.
        box_type_3d (str, optional): Type of 3D box of this dataset.
            Based on the `box_type_3d`, the dataset will encapsulate the box
            to its original format then converted them to `box_type_3d`.
            Defaults to 'LiDAR' in this dataset. Available options include:

            - 'LiDAR': Box in LiDAR coordinates.
            - 'Depth': Box in depth coordinates, usually for indoor dataset.
            - 'Camera': Box in camera coordinates.
        filter_empty_gt (bool, optional): Whether to filter empty GT.
            Defaults to True.
        test_mode (bool, optional): Whether the dataset is in test mode.
            Defaults to False.
        eval_version (str, optional): Configuration version of evaluation.
            Defaults to 'detection_cvpr_2019'.
        use_valid_flag (bool): Whether to use `use_valid_flag` key in the info
            file as mask to filter gt_boxes and gt_names. Defaults to False.
    """
    # Raw nuScenes category name -> detection class name.
    NameMapping = {
        'movable_object.barrier': 'barrier',
        'vehicle.bicycle': 'bicycle',
        'vehicle.bus.bendy': 'bus',
        'vehicle.bus.rigid': 'bus',
        'vehicle.car': 'car',
        'vehicle.construction': 'construction_vehicle',
        'vehicle.motorcycle': 'motorcycle',
        'human.pedestrian.adult': 'pedestrian',
        'human.pedestrian.child': 'pedestrian',
        'human.pedestrian.construction_worker': 'pedestrian',
        'human.pedestrian.police_officer': 'pedestrian',
        'movable_object.trafficcone': 'traffic_cone',
        'vehicle.trailer': 'trailer',
        'vehicle.truck': 'truck'
    }
    # Fallback attribute written for a class when `_format_bbox` has no
    # speed-based rule for it.
    DefaultAttribute = {
        'car': 'vehicle.parked',
        'pedestrian': 'pedestrian.moving',
        'trailer': 'vehicle.parked',
        'truck': 'vehicle.parked',
        'bus': 'vehicle.moving',
        'motorcycle': 'cycle.without_rider',
        'construction_vehicle': 'vehicle.parked',
        'bicycle': 'cycle.without_rider',
        'barrier': '',
        'traffic_cone': '',
    }
    # Attribute name -> integer id.
    AttrMapping = {
        'cycle.with_rider': 0,
        'cycle.without_rider': 1,
        'pedestrian.moving': 2,
        'pedestrian.standing': 3,
        'pedestrian.sitting_lying_down': 4,
        'vehicle.moving': 5,
        'vehicle.parked': 6,
        'vehicle.stopped': 7,
    }
    # Integer id -> attribute name (list index is the id).
    AttrMapping_rev = [
        'cycle.with_rider',
        'cycle.without_rider',
        'pedestrian.moving',
        'pedestrian.standing',
        'pedestrian.sitting_lying_down',
        'vehicle.moving',
        'vehicle.parked',
        'vehicle.stopped',
    ]
    CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
               'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
               'barrier')

    def __init__(self,
                 ann_file,
num_views=6, pipeline=None, data_root=None, classes=None, load_interval=1, with_velocity=True, modality=None, box_type_3d='LiDAR', filter_empty_gt=True, test_mode=False, eval_version='detection_cvpr_2019', use_valid_flag=False): self.load_interval = load_interval self.use_valid_flag = use_valid_flag super().__init__( data_root=data_root, ann_file=ann_file, pipeline=pipeline, classes=classes, modality=modality, box_type_3d=box_type_3d, filter_empty_gt=filter_empty_gt, test_mode=test_mode) self.num_views = num_views assert self.num_views <= 6 self.with_velocity = with_velocity self.eval_version = eval_version from nuscenes.eval.detection.config import config_factory self.eval_detection_configs = config_factory(self.eval_version) if self.modality is None: self.modality = dict( use_camera=False, use_lidar=True, use_radar=False, use_map=False, use_external=False, ) def get_cat_ids(self, idx): """Get category distribution of single scene. Args: idx (int): Index of the data_info. Returns: dict[list]: for each category, if the current scene contains such boxes, store a list containing idx, otherwise, store empty list. """ info = self.data_infos[idx] if self.use_valid_flag: mask = info['valid_flag'] gt_names = set(info['gt_names'][mask]) else: gt_names = set(info['gt_names']) cat_ids = [] for name in gt_names: if name in self.CLASSES: cat_ids.append(self.cat2id[name]) return cat_ids def load_annotations(self, ann_file): """Load annotations from ann_file. Args: ann_file (str): Path of the annotation file. Returns: list[dict]: List of annotations sorted by timestamps. """ data = mmcv.load(ann_file) data_infos = list(sorted(data['infos'], key=lambda e: e['timestamp'])) data_infos = data_infos[::self.load_interval] self.metadata = data['metadata'] self.version = self.metadata['version'] return data_infos def get_data_info(self, index): """Get data info according to the given index. Args: index (int): Index of the sample data to get. 
Returns: dict: Data information that will be passed to the data \ preprocessing pipelines. It includes the following keys: - sample_idx (str): Sample index. - pts_filename (str): Filename of point clouds. - sweeps (list[dict]): Infos of sweeps. - timestamp (float): Sample timestamp. - img_filename (str, optional): Image filename. - lidar2img (list[np.ndarray], optional): Transformations \ from lidar to different cameras. - ann_info (dict): Annotation info. """ info = self.data_infos[index] # standard protocal modified from SECOND.Pytorch input_dict = dict( sample_idx=info['token'], pts_filename=info['lidar_path'], sweeps=info['sweeps'], timestamp=info['timestamp'] / 1e6, ) cam_orders = ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT', 'CAM_BACK', 'CAM_BACK_LEFT'] if self.modality['use_camera']: image_paths = [] lidar2img_rts = [] # for cam_type, cam_info in info['cams'].items(): intrinsics = [] lidar2cam_rs = [] lidar2cam_ts = [] for cam_type in cam_orders: cam_info = info['cams'][cam_type] image_paths.append(cam_info['data_path']) # obtain lidar to image transformation matrix lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) lidar2cam_t = cam_info[ 'sensor2lidar_translation'] @ lidar2cam_r.T lidar2cam_rt = np.eye(4) lidar2cam_rt[:3, :3] = lidar2cam_r.T lidar2cam_rt[3, :3] = -lidar2cam_t intrinsic = cam_info['cam_intrinsic'] viewpad = np.eye(4) viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic lidar2img_rt = (viewpad @ lidar2cam_rt.T) lidar2img_rts.append(lidar2img_rt) intrinsics.append(intrinsic) lidar2cam_rs.append(lidar2cam_r) lidar2cam_ts.append(lidar2cam_t) input_dict.update( dict( img_filename=image_paths, lidar2img=lidar2img_rts, cam_intrinsic=intrinsics, lidar2cam_r=lidar2cam_rs, lidar2cam_t=lidar2cam_ts, )) if not self.test_mode: annos = self.get_ann_info(index) input_dict['ann_info'] = annos return input_dict def get_ann_info(self, index): """Get annotation info according to the given index. 
Args: index (int): Index of the annotation data to get. Returns: dict: Annotation information consists of the following keys: - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): \ 3D ground truth bboxes - gt_labels_3d (np.ndarray): Labels of ground truths. - gt_names (list[str]): Class names of ground truths. """ info = self.data_infos[index] # filter out bbox containing no points if self.use_valid_flag: mask = info['valid_flag'] else: mask = info['num_lidar_pts'] > 0 gt_bboxes_3d = info['gt_boxes'][mask] gt_names_3d = info['gt_names'][mask] gt_labels_3d = [] for cat in gt_names_3d: if cat in self.CLASSES: gt_labels_3d.append(self.CLASSES.index(cat)) else: gt_labels_3d.append(-1) gt_labels_3d = np.array(gt_labels_3d) if self.with_velocity: gt_velocity = info['gt_velocity'][mask] nan_mask = np.isnan(gt_velocity[:, 0]) gt_velocity[nan_mask] = [0.0, 0.0] gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1) # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be # the same as KITTI (0.5, 0.5, 0) gt_bboxes_3d = LiDARInstance3DBoxes( gt_bboxes_3d, box_dim=gt_bboxes_3d.shape[-1], origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) anns_results = dict( gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d, gt_names=gt_names_3d) return anns_results def _format_bbox(self, results, jsonfile_prefix=None): """Convert the results to the standard format. Args: results (list[dict]): Testing results of the dataset. jsonfile_prefix (str): The prefix of the output jsonfile. You can specify the output directory/filename by modifying the jsonfile_prefix. Default: None. Returns: str: Path of the output json file. 
""" nusc_annos = {} mapped_class_names = self.CLASSES print('Start to convert detection format...') for sample_id, det in enumerate(mmcv.track_iter_progress(results)): annos = [] boxes = output_to_nusc_box(det) sample_token = self.data_infos[sample_id]['token'] boxes = lidar_nusc_box_to_global(self.data_infos[sample_id], boxes, mapped_class_names, self.eval_detection_configs, self.eval_version) for i, box in enumerate(boxes): name = mapped_class_names[box.label] if np.sqrt(box.velocity[0]**2 + box.velocity[1]**2) > 0.2: if name in [ 'car', 'construction_vehicle', 'bus', 'truck', 'trailer', ]: attr = 'vehicle.moving' elif name in ['bicycle', 'motorcycle']: attr = 'cycle.with_rider' else: attr = NuScenesDataset.DefaultAttribute[name] else: if name in ['pedestrian']: attr = 'pedestrian.standing' elif name in ['bus']: attr = 'vehicle.stopped' else: attr = NuScenesDataset.DefaultAttribute[name] nusc_anno = dict( sample_token=sample_token, translation=box.center.tolist(), size=box.wlh.tolist(), rotation=box.orientation.elements.tolist(), velocity=box.velocity[:2].tolist(), detection_name=name, detection_score=box.score, attribute_name=attr) annos.append(nusc_anno) nusc_annos[sample_token] = annos nusc_submissions = { 'meta': self.modality, 'results': nusc_annos, } mmcv.mkdir_or_exist(jsonfile_prefix) res_path = osp.join(jsonfile_prefix, 'results_nusc.json') print('Results writes to', res_path) mmcv.dump(nusc_submissions, res_path) return res_path def _evaluate_single(self, result_path, logger=None, metric='bbox', result_name='pts_bbox'): """Evaluation for a single model in nuScenes protocol. Args: result_path (str): Path of the result file. logger (logging.Logger | str | None): Logger used for printing related information during evaluation. Default: None. metric (str): Metric name used for evaluation. Default: 'bbox'. result_name (str): Result name in the metric prefix. Default: 'pts_bbox'. Returns: dict: Dictionary of evaluation details. 
""" from nuscenes import NuScenes from nuscenes.eval.detection.evaluate import NuScenesEval output_dir = osp.join(*osp.split(result_path)[:-1]) nusc = NuScenes( version=self.version, dataroot=self.data_root, verbose=False) eval_set_map = { 'v1.0-mini': 'mini_val', 'v1.0-trainval': 'val', } nusc_eval = NuScenesEval( nusc, config=self.eval_detection_configs, result_path=result_path, eval_set=eval_set_map[self.version], output_dir=output_dir, verbose=False) nusc_eval.main(render_curves=False) # record metrics metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json')) detail = dict() metric_prefix = f'{result_name}_NuScenes' for name in self.CLASSES: for k, v in metrics['label_aps'][name].items(): val = float('{:.4f}'.format(v)) detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val for k, v in metrics['label_tp_errors'][name].items(): val = float('{:.4f}'.format(v)) detail['{}/{}_{}'.format(metric_prefix, name, k)] = val detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] return detail def format_results(self, results, jsonfile_prefix=None): """Format the results to json (standard format for COCO evaluation). Args: results (list[dict]): Testing results of the dataset. jsonfile_prefix (str | None): The prefix of json files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. Returns: tuple: Returns (result_files, tmp_dir), where `result_files` is a \ dict containing the json filepaths, `tmp_dir` is the temporal \ directory created for saving json files when \ `jsonfile_prefix` is not specified. """ assert isinstance(results, list), 'results must be a list' assert len(results) == len(self), ( 'The length of results is not equal to the dataset len: {} != {}'. 
format(len(results), len(self))) if jsonfile_prefix is None: tmp_dir = tempfile.TemporaryDirectory() jsonfile_prefix = osp.join(tmp_dir.name, 'results') else: tmp_dir = None if not isinstance(results[0], dict): result_files = self._format_bbox(results, jsonfile_prefix) else: result_files = dict() for name in results[0]: print(f'\nFormating bboxes of {name}') results_ = [out[name] for out in results] tmp_file_ = osp.join(jsonfile_prefix, name) result_files.update( {name: self._format_bbox(results_, tmp_file_)}) return result_files, tmp_dir def evaluate(self, results, metric='bbox', logger=None, jsonfile_prefix=None, result_names=['pts_bbox'], show=False, out_dir=None): """Evaluation in nuScenes protocol. Args: results (list[dict]): Testing results of the dataset. metric (str | list[str]): Metrics to be evaluated. logger (logging.Logger | str | None): Logger used for printing related information during evaluation. Default: None. jsonfile_prefix (str | None): The prefix of json files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. show (bool): Whether to visualize. Default: False. out_dir (str): Path to save the visualization results. Default: None. Returns: dict[str, float]: Results of each evaluation metric. """ result_files, tmp_dir = self.format_results(results, jsonfile_prefix) if isinstance(result_files, dict): results_dict = dict() for name in result_names: print('Evaluating bboxes of {}'.format(name)) ret_dict = self._evaluate_single(result_files[name]) results_dict.update(ret_dict) elif isinstance(result_files, str): results_dict = self._evaluate_single(result_files) if tmp_dir is not None: tmp_dir.cleanup() if show: self.show(results, out_dir) return results_dict def show(self, results, out_dir): """Results visualization. Args: results (list[dict]): List of bounding boxes results. out_dir (str): Output directory of visualization result. 
""" for i, result in enumerate(results): example = self.prepare_test_data(i) points = example['points'][0]._data.numpy() data_info = self.data_infos[i] pts_path = data_info['lidar_path'] file_name = osp.split(pts_path)[-1].split('.')[0] # for now we convert points into depth mode points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, Coord3DMode.DEPTH) inds = result['pts_bbox']['scores_3d'] > 0.1 gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR, Box3DMode.DEPTH) pred_bboxes = result['pts_bbox']['boxes_3d'][inds].tensor.numpy() pred_bboxes = Box3DMode.convert(pred_bboxes, Box3DMode.LIDAR, Box3DMode.DEPTH) show_result(points, gt_bboxes, pred_bboxes, out_dir, file_name) def output_to_nusc_box(detection): """Convert the output to the box class in the nuScenes. Args: detection (dict): Detection results. - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. - scores_3d (torch.Tensor): Detection scores. - labels_3d (torch.Tensor): Predicted box labels. Returns: list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes. 
""" box3d = detection['boxes_3d'] scores = detection['scores_3d'].numpy() labels = detection['labels_3d'].numpy() box_gravity_center = box3d.gravity_center.numpy() box_dims = box3d.dims.numpy() box_yaw = box3d.yaw.numpy() # TODO: check whether this is necessary # with dir_offset & dir_limit in the head box_yaw = -box_yaw - np.pi / 2 box_list = [] for i in range(len(box3d)): quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) velocity = (*box3d.tensor[i, 7:9], 0.0) # velo_val = np.linalg.norm(box3d[i, 7:9]) # velo_ori = box3d[i, 6] # velocity = ( # velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0) box = NuScenesBox( box_gravity_center[i], box_dims[i], quat, label=labels[i], score=scores[i], velocity=velocity) box_list.append(box) return box_list def lidar_nusc_box_to_global(info, boxes, classes, eval_configs, eval_version='detection_cvpr_2019'): """Convert the box from ego to global coordinate. Args: info (dict): Info for a specific sample data, including the calibration information. boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. classes (list[str]): Mapped classes in the evaluation. eval_configs (object): Evaluation configuration object. eval_version (str): Evaluation version. Default: 'detection_cvpr_2019' Returns: list: List of standard NuScenesBoxes in the global coordinate. """ box_list = [] for box in boxes: # Move box to ego vehicle coord system box.rotate(pyquaternion.Quaternion(info['lidar2ego_rotation'])) box.translate(np.array(info['lidar2ego_translation'])) # filter det in ego. 
cls_range_map = eval_configs.class_range radius = np.linalg.norm(box.center[:2], 2) det_range = cls_range_map[classes[box.label]] if radius > det_range: continue # Move box to global coord system box.rotate(pyquaternion.Quaternion(info['ego2global_rotation'])) box.translate(np.array(info['ego2global_translation'])) box_list.append(box) return box_list ================================================ FILE: mmdet3d/datasets/nuscenes_dataset_viewInfo.py ================================================ import mmcv import numpy as np import pyquaternion import tempfile from nuscenes.utils.data_classes import Box as NuScenesBox from os import path as osp from mmdet.datasets import DATASETS from ..core import show_result from ..core.bbox import Box3DMode, Coord3DMode, LiDARInstance3DBoxes, CameraInstance3DBoxes from .nuscenes_dataset import NuScenesDataset @DATASETS.register_module() class NuScenesDataset_ViewInfo(NuScenesDataset): """ Compared with NuScenesDataset, we also load 2d annotations """ def get_ann_info(self, index): """Get annotation info according to the given index. Args: index (int): Index of the annotation data to get. Returns: dict: Annotation information consists of the following keys: """ info = self.data_infos[index] # filter out bbox containing no points if self.use_valid_flag: mask = info['valid_flag'] else: mask = info['num_lidar_pts'] > 0 gt_bboxes_3d = info['gt_boxes'][mask] gt_names_3d = info['gt_names'][mask] gt_visible_3d = info['gt_visible'][mask] # .copy() cannot be missed! 
gt_bboxes2d_view = info['gt_bboxes2d_view'].copy() gt_bboxes2d_view[..., :2] = gt_bboxes2d_view[..., :2] + gt_bboxes2d_view[..., 2:4] / 2 gt_bboxes_lidar_view = info['gt_bboxes_lidar_view'].copy() gt_names2d_view = info['gt_names2d_view'] gt_viewsIDs = info['gt_viewsIDs'] gt_labels_3d = [] for cat in gt_names_3d: if cat in self.CLASSES: gt_labels_3d.append(self.CLASSES.index(cat)) else: gt_labels_3d.append(-1) gt_labels_3d = np.array(gt_labels_3d) gt_labels2d_view = [] for cat in gt_names2d_view: if cat in self.CLASSES: gt_labels2d_view.append(self.CLASSES.index(cat)) else: gt_labels2d_view.append(-1) gt_labels2d_view = np.array(gt_labels2d_view) gt_labels2d_view = np.stack([gt_labels2d_view, gt_viewsIDs], axis=-1) gt_bboxes_cam_view = info['gt_bboxes_cam_view'].copy() if self.with_velocity: gt_velocity = info['gt_velocity'][mask].copy() nan_mask = np.isnan(gt_velocity[:, 0]) gt_velocity[nan_mask] = [0.0, 0.0] gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1) gt_cam_vel = info['gt_velocity_cam_view'].copy() nan_mask_cam = np.isnan(gt_cam_vel[:, 0]) gt_cam_vel[nan_mask_cam] = [0.0, 0.0] gt_bboxes_cam_view = np.concatenate([gt_bboxes_cam_view, gt_cam_vel], axis=-1) gt_lidar_vel =info['gt_velocity_lidar_view'].copy() nan_mask_lidar = np.isnan(gt_lidar_vel[:, 0]) gt_lidar_vel[nan_mask_lidar] = [0.0, 0.0] gt_bboxes_lidar_view = np.concatenate([gt_bboxes_lidar_view, gt_lidar_vel], axis=-1) gt_bboxes_cam_view = CameraInstance3DBoxes( gt_bboxes_cam_view, box_dim=gt_bboxes_cam_view.shape[-1], origin=(0.5, 0.5, 0.5) ) gt_bboxes_lidar_view = LiDARInstance3DBoxes( gt_bboxes_lidar_view, box_dim=gt_bboxes_lidar_view.shape[-1], origin=(0.5, 0.5, 0.5) ).convert_to(self.box_mode_3d) # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be # the same as KITTI (0.5, 0.5, 0) gt_bboxes_3d = LiDARInstance3DBoxes( gt_bboxes_3d, box_dim=gt_bboxes_3d.shape[-1], origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) if "gt_pts_centers_view" in info: 
gt_pts_centers_view = info['gt_pts_centers_view'].copy() gt_img_centers_view = info['gt_img_centers_view'].copy() anns_results = dict( gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d, gt_visible_3d=gt_visible_3d, gt_names=gt_names_3d, bboxes=gt_bboxes2d_view, labels=gt_labels2d_view, pts_centers_view=gt_pts_centers_view, img_centers_view=gt_img_centers_view, bboxes_cam_view=gt_bboxes_cam_view, bboxes_lidar_view=gt_bboxes_lidar_view, ) else: anns_results = dict( gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d, gt_visible_3d=gt_visible_3d, gt_names=gt_names_3d, bboxes=gt_bboxes2d_view, labels=gt_labels2d_view, ) return anns_results def get_data_info(self, index): """Get data info according to the given index. Args: index (int): Index of the sample data to get. Returns: dict: Data information that will be passed to the data \ preprocessing pipelines. It includes the following keys: - sample_idx (str): Sample index. - pts_filename (str): Filename of point clouds. - sweeps (list[dict]): Infos of sweeps. - timestamp (float): Sample timestamp. - img_filename (str, optional): Image filename. - lidar2img (list[np.ndarray], optional): Transformations \ from lidar to different cameras. - ann_info (dict): Annotation info. 
""" info = self.data_infos[index] # standard protocal modified from SECOND.Pytorch input_dict = dict( sample_idx=info['token'], pts_filename=info['lidar_path'], sweeps=info['sweeps'], timestamp=info['timestamp'] / 1e6, ) cam_orders = ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT', 'CAM_BACK', 'CAM_BACK_LEFT'] if self.modality['use_camera']: image_paths = [] lidar2img_rts = [] # for cam_type, cam_info in info['cams'].items(): intrinsics = [] lidar2cam_rs = [] lidar2cam_ts = [] for cam_type in cam_orders: cam_info = info['cams'][cam_type] image_paths.append(cam_info['data_path']) # obtain lidar to image transformation matrix lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) lidar2cam_t = cam_info[ 'sensor2lidar_translation'] @ lidar2cam_r.T lidar2cam_rt = np.eye(4) lidar2cam_rt[:3, :3] = lidar2cam_r.T lidar2cam_rt[3, :3] = -lidar2cam_t intrinsic = cam_info['cam_intrinsic'] viewpad = np.eye(4) viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic lidar2img_rt = (viewpad @ lidar2cam_rt.T) lidar2img_rts.append(lidar2img_rt.copy()) intrinsics.append(intrinsic.copy()) lidar2cam_rs.append(lidar2cam_r.copy()) lidar2cam_ts.append(-lidar2cam_t.copy()) input_dict.update( dict( img_filename=image_paths, lidar2img=lidar2img_rts, cam_intrinsic=intrinsics, lidar2cam_r=lidar2cam_rs, lidar2cam_t=lidar2cam_ts, )) if not self.test_mode: annos = self.get_ann_info(index) input_dict['ann_info'] = annos return input_dict ================================================ FILE: mmdet3d/datasets/pipelines/__init__.py ================================================ from mmdet.datasets.pipelines import Compose from .dbsampler import DataBaseSampler from .formating import Collect3D, DefaultFormatBundle, DefaultFormatBundle3D from .loading import (LoadAnnotations3D, LoadMultiViewImageFromFiles, LoadPointsFromFile, LoadPointsFromMultiSweeps, NormalizePointsColor, PointSegClassMapping, MyLoadAnnotations3D) from .test_time_aug import MultiScaleFlipAug3D 
from .transforms_3d import (BackgroundPointsFilter, GlobalRotScaleTrans,
                            IndoorPointSample, ObjectNoise, ObjectRangeFilter,
                            ObjectSample, PointShuffle, PointsRangeFilter,
                            RandomFlip3D, VoxelBasedPointSampler, OurRandomFlip3D,
                            OurGlobalRotScaleTrans, OurObjectRangeFilter)
from .transforms_2d import OurRandomAffine, PhotoMetricDistortionMultiViewImage

__all__ = [
    'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans',
    'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D',
    'Compose', 'LoadMultiViewImageFromFiles', 'LoadPointsFromFile',
    'DefaultFormatBundle', 'DefaultFormatBundle3D', 'DataBaseSampler',
    'NormalizePointsColor', 'LoadAnnotations3D', 'IndoorPointSample',
    'PointSegClassMapping', 'MultiScaleFlipAug3D', 'LoadPointsFromMultiSweeps',
    'BackgroundPointsFilter', 'VoxelBasedPointSampler', 'MyLoadAnnotations3D',
    'OurRandomFlip3D', 'OurGlobalRotScaleTrans', 'OurRandomAffine',
    'PhotoMetricDistortionMultiViewImage', 'OurObjectRangeFilter'
]


================================================
FILE: mmdet3d/datasets/pipelines/data_augment_utils.py
================================================
import numba
import numpy as np
import warnings
from numba.errors import NumbaPerformanceWarning

from mmdet3d.core.bbox import box_np_ops

warnings.filterwarnings('ignore', category=NumbaPerformanceWarning)


@numba.njit
def _rotation_box2d_jit_(corners, angle, rot_mat_T):
    """Rotate 2D boxes.

    Both ``rot_mat_T`` and ``corners`` are overwritten in place.

    Args:
        corners (np.ndarray): Corners of boxes.
        angle (float): Rotation angle.
        rot_mat_T (np.ndarray): Transposed rotation matrix. Used as a
            scratch buffer: its four entries are rewritten from ``angle``.
    """
    rot_sin = np.sin(angle)
    rot_cos = np.cos(angle)
    rot_mat_T[0, 0] = rot_cos
    rot_mat_T[0, 1] = -rot_sin
    rot_mat_T[1, 0] = rot_sin
    rot_mat_T[1, 1] = rot_cos
    corners[:] = corners @ rot_mat_T


@numba.jit(nopython=True)
def box_collision_test(boxes, qboxes, clockwise=True):
    """Box collision test.

    Args:
        boxes (np.ndarray): Corners of current boxes.
        qboxes (np.ndarray): Boxes to be avoid colliding.
        clockwise (bool): Whether the corners are in clockwise order.
            Default: True.

    Returns:
        np.ndarray: Boolean matrix of shape (N, K) for N ``boxes`` and K
            ``qboxes``; entry (i, j) is set when box i is found to collide
            with qbox j.
    """
    N = boxes.shape[0]
    K = qboxes.shape[0]
    ret = np.zeros((N, K), dtype=np.bool_)
    # index of the next corner for each of the 4 corners
    slices = np.array([1, 2, 3, 0])
    lines_boxes = np.stack((boxes, boxes[:, slices, :]),
                           axis=2)  # [N, 4, 2(line), 2(xy)]
    lines_qboxes = np.stack((qboxes, qboxes[:, slices, :]), axis=2)
    # vec = np.zeros((2,), dtype=boxes.dtype)
    boxes_standup = box_np_ops.corner_to_standup_nd_jit(boxes)
    qboxes_standup = box_np_ops.corner_to_standup_nd_jit(qboxes)
    for i in range(N):
        for j in range(K):
            # calculate standup first: the edge tests below only run when
            # the standup boxes overlap on both axes
            iw = (
                min(boxes_standup[i, 2], qboxes_standup[j, 2]) -
                max(boxes_standup[i, 0], qboxes_standup[j, 0]))
            if iw > 0:
                ih = (
                    min(boxes_standup[i, 3], qboxes_standup[j, 3]) -
                    max(boxes_standup[i, 1], qboxes_standup[j, 1]))
                if ih > 0:
                    # test every edge AB of box i against every edge CD of
                    # qbox j
                    for k in range(4):
                        for box_l in range(4):
                            A = lines_boxes[i, k, 0]
                            B = lines_boxes[i, k, 1]
                            C = lines_qboxes[j, box_l, 0]
                            D = lines_qboxes[j, box_l, 1]
                            acd = (D[1] - A[1]) * (C[0] - A[0]) > (
                                C[1] - A[1]) * (
                                    D[0] - A[0])
                            bcd = (D[1] - B[1]) * (C[0] - B[0]) > (
                                C[1] - B[1]) * (
                                    D[0] - B[0])
                            if acd != bcd:
                                abc = (C[1] - A[1]) * (B[0] - A[0]) > (
                                    B[1] - A[1]) * (
                                        C[0] - A[0])
                                abd = (D[1] - A[1]) * (B[0] - A[0]) > (
                                    B[1] - A[1]) * (
                                        D[0] - A[0])
                                if abc != abd:
                                    ret[i, j] = True  # collision.
                                    break
                        if ret[i, j] is True:
                            break
                    if ret[i, j] is False:
                        # now check complete overlap.
# box overlap qbox: box_overlap_qbox = True for box_l in range(4): # point l in qboxes for k in range(4): # corner k in boxes vec = boxes[i, k] - boxes[i, (k + 1) % 4] if clockwise: vec = -vec cross = vec[1] * ( boxes[i, k, 0] - qboxes[j, box_l, 0]) cross -= vec[0] * ( boxes[i, k, 1] - qboxes[j, box_l, 1]) if cross >= 0: box_overlap_qbox = False break if box_overlap_qbox is False: break if box_overlap_qbox is False: qbox_overlap_box = True for box_l in range(4): # point box_l in boxes for k in range(4): # corner k in qboxes vec = qboxes[j, k] - qboxes[j, (k + 1) % 4] if clockwise: vec = -vec cross = vec[1] * ( qboxes[j, k, 0] - boxes[i, box_l, 0]) cross -= vec[0] * ( qboxes[j, k, 1] - boxes[i, box_l, 1]) if cross >= 0: # qbox_overlap_box = False break if qbox_overlap_box is False: break if qbox_overlap_box: ret[i, j] = True # collision. else: ret[i, j] = True # collision. return ret @numba.njit def noise_per_box(boxes, valid_mask, loc_noises, rot_noises): """Add noise to every box (only on the horizontal plane). Args: boxes (np.ndarray): Input boxes with shape (N, 5). valid_mask (np.ndarray): Mask to indicate which boxes are valid with shape (N). loc_noises (np.ndarray): Location noises with shape (N, M, 3). rot_noises (np.ndarray): Rotation noises with shape (N, M). Returns: np.ndarray: Mask to indicate whether the noise is added successfully (pass the collision test). 
""" num_boxes = boxes.shape[0] num_tests = loc_noises.shape[1] box_corners = box_np_ops.box2d_to_corner_jit(boxes) current_corners = np.zeros((4, 2), dtype=boxes.dtype) rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) success_mask = -np.ones((num_boxes, ), dtype=np.int64) # print(valid_mask) for i in range(num_boxes): if valid_mask[i]: for j in range(num_tests): current_corners[:] = box_corners[i] current_corners -= boxes[i, :2] _rotation_box2d_jit_(current_corners, rot_noises[i, j], rot_mat_T) current_corners += boxes[i, :2] + loc_noises[i, j, :2] coll_mat = box_collision_test( current_corners.reshape(1, 4, 2), box_corners) coll_mat[0, i] = False # print(coll_mat) if not coll_mat.any(): success_mask[i] = j box_corners[i] = current_corners break return success_mask @numba.njit def noise_per_box_v2_(boxes, valid_mask, loc_noises, rot_noises, global_rot_noises): """Add noise to every box (only on the horizontal plane). Version 2 used when enable global rotations. Args: boxes (np.ndarray): Input boxes with shape (N, 5). valid_mask (np.ndarray): Mask to indicate which boxes are valid with shape (N). loc_noises (np.ndarray): Location noises with shape (N, M, 3). rot_noises (np.ndarray): Rotation noises with shape (N, M). Returns: np.ndarray: Mask to indicate whether the noise is added successfully (pass the collision test). 
""" num_boxes = boxes.shape[0] num_tests = loc_noises.shape[1] box_corners = box_np_ops.box2d_to_corner_jit(boxes) current_corners = np.zeros((4, 2), dtype=boxes.dtype) current_box = np.zeros((1, 5), dtype=boxes.dtype) rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) dst_pos = np.zeros((2, ), dtype=boxes.dtype) success_mask = -np.ones((num_boxes, ), dtype=np.int64) corners_norm = np.zeros((4, 2), dtype=boxes.dtype) corners_norm[1, 1] = 1.0 corners_norm[2] = 1.0 corners_norm[3, 0] = 1.0 corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) corners_norm = corners_norm.reshape(4, 2) for i in range(num_boxes): if valid_mask[i]: for j in range(num_tests): current_box[0, :] = boxes[i] current_radius = np.sqrt(boxes[i, 0]**2 + boxes[i, 1]**2) current_grot = np.arctan2(boxes[i, 0], boxes[i, 1]) dst_grot = current_grot + global_rot_noises[i, j] dst_pos[0] = current_radius * np.sin(dst_grot) dst_pos[1] = current_radius * np.cos(dst_grot) current_box[0, :2] = dst_pos current_box[0, -1] += (dst_grot - current_grot) rot_sin = np.sin(current_box[0, -1]) rot_cos = np.cos(current_box[0, -1]) rot_mat_T[0, 0] = rot_cos rot_mat_T[0, 1] = -rot_sin rot_mat_T[1, 0] = rot_sin rot_mat_T[1, 1] = rot_cos current_corners[:] = current_box[ 0, 2:4] * corners_norm @ rot_mat_T + current_box[0, :2] current_corners -= current_box[0, :2] _rotation_box2d_jit_(current_corners, rot_noises[i, j], rot_mat_T) current_corners += current_box[0, :2] + loc_noises[i, j, :2] coll_mat = box_collision_test( current_corners.reshape(1, 4, 2), box_corners) coll_mat[0, i] = False if not coll_mat.any(): success_mask[i] = j box_corners[i] = current_corners loc_noises[i, j, :2] += (dst_pos - boxes[i, :2]) rot_noises[i, j] += (dst_grot - current_grot) break return success_mask def _select_transform(transform, indices): """Select transform. Args: transform (np.ndarray): Transforms to select from. indices (np.ndarray): Mask to indicate which transform to select. Returns: np.ndarray: Selected transforms. 
""" result = np.zeros((transform.shape[0], *transform.shape[2:]), dtype=transform.dtype) for i in range(transform.shape[0]): if indices[i] != -1: result[i] = transform[i, indices[i]] return result @numba.njit def _rotation_matrix_3d_(rot_mat_T, angle, axis): """Get the 3D rotation matrix. Args: rot_mat_T (np.ndarray): Transposed rotation matrix. angle (float): Rotation angle. axis (int): Rotation axis. """ rot_sin = np.sin(angle) rot_cos = np.cos(angle) rot_mat_T[:] = np.eye(3) if axis == 1: rot_mat_T[0, 0] = rot_cos rot_mat_T[0, 2] = -rot_sin rot_mat_T[2, 0] = rot_sin rot_mat_T[2, 2] = rot_cos elif axis == 2 or axis == -1: rot_mat_T[0, 0] = rot_cos rot_mat_T[0, 1] = -rot_sin rot_mat_T[1, 0] = rot_sin rot_mat_T[1, 1] = rot_cos elif axis == 0: rot_mat_T[1, 1] = rot_cos rot_mat_T[1, 2] = -rot_sin rot_mat_T[2, 1] = rot_sin rot_mat_T[2, 2] = rot_cos @numba.njit def points_transform_(points, centers, point_masks, loc_transform, rot_transform, valid_mask): """Apply transforms to points and box centers. Args: points (np.ndarray): Input points. centers (np.ndarray): Input box centers. point_masks (np.ndarray): Mask to indicate which points need to be transformed. loc_transform (np.ndarray): Location transform to be applied. rot_transform (np.ndarray): Rotation transform to be applied. valid_mask (np.ndarray): Mask to indicate which boxes are valid. """ num_box = centers.shape[0] num_points = points.shape[0] rot_mat_T = np.zeros((num_box, 3, 3), dtype=points.dtype) for i in range(num_box): _rotation_matrix_3d_(rot_mat_T[i], rot_transform[i], 2) for i in range(num_points): for j in range(num_box): if valid_mask[j]: if point_masks[i, j] == 1: points[i, :3] -= centers[j, :3] points[i:i + 1, :3] = points[i:i + 1, :3] @ rot_mat_T[j] points[i, :3] += centers[j, :3] points[i, :3] += loc_transform[j] break # only apply first box's transform @numba.njit def box3d_transform_(boxes, loc_transform, rot_transform, valid_mask): """Transform 3D boxes. 
def noise_per_object_v3_(gt_boxes,
                         points=None,
                         valid_mask=None,
                         rotation_perturb=np.pi / 4,
                         center_noise_std=1.0,
                         global_random_rot_range=np.pi / 4,
                         num_try=100):
    """Randomly perturb each ground-truth box, and its points, in place.

    For every box, ``num_try`` candidate translations and rotations are
    drawn. ``noise_per_box`` (or ``noise_per_box_v2_`` when global rotation
    is enabled) returns the index of the candidate to use for each box, and
    that candidate is applied to the box. If ``points`` is given, the same
    transform is applied to the points flagged by
    ``points_in_convex_polygon_3d_jit`` for that box. Boxes for which no
    candidate is selected receive a zero transform.

    ``gt_boxes`` and ``points`` are modified in place; nothing is returned.

    Args:
        gt_boxes (np.ndarray): Ground truth boxes with shape (N, 7).
        points (np.ndarray | None): Input point cloud with shape (M, 4).
            Default: None.
        valid_mask (np.ndarray | None): Mask to indicate which boxes are
            valid. All boxes are treated as valid when None.
            Default: None.
        rotation_perturb (float | list | tuple | np.ndarray): Rotation
            perturbation. A scalar ``r`` is expanded to the range
            ``[-r, r]``. Default: pi / 4.
        center_noise_std (float | list | tuple | np.ndarray): Center noise
            standard deviation. A scalar is used for all three axes.
            Default: 1.0.
        global_random_rot_range (float | list | tuple | np.ndarray): Global
            random rotation range. A scalar ``g`` is expanded to
            ``[-g, g]``. Global rotation is enabled only when the two
            bounds differ by at least 1e-3. Default: pi / 4.
        num_try (int): Number of candidate noises drawn per box.
            Default: 100.
    """
    num_boxes = gt_boxes.shape[0]
    if not isinstance(rotation_perturb, (list, tuple, np.ndarray)):
        rotation_perturb = [-rotation_perturb, rotation_perturb]
    if not isinstance(global_random_rot_range, (list, tuple, np.ndarray)):
        global_random_rot_range = [
            -global_random_rot_range, global_random_rot_range
        ]
    enable_grot = np.abs(global_random_rot_range[0] -
                         global_random_rot_range[1]) >= 1e-3

    if not isinstance(center_noise_std, (list, tuple, np.ndarray)):
        center_noise_std = [
            center_noise_std, center_noise_std, center_noise_std
        ]
    if valid_mask is None:
        valid_mask = np.ones((num_boxes, ), dtype=np.bool_)
    center_noise_std = np.array(center_noise_std, dtype=gt_boxes.dtype)

    # The three random draws below are kept in their original order so that
    # seeded runs consume the random stream exactly as before.
    loc_noises = np.random.normal(
        scale=center_noise_std, size=[num_boxes, num_try, 3])
    rot_noises = np.random.uniform(
        rotation_perturb[0], rotation_perturb[1], size=[num_boxes, num_try])
    gt_grots = np.arctan2(gt_boxes[:, 0], gt_boxes[:, 1])
    grot_lowers = global_random_rot_range[0] - gt_grots
    grot_uppers = global_random_rot_range[1] - gt_grots
    global_rot_noises = np.random.uniform(
        grot_lowers[..., np.newaxis],
        grot_uppers[..., np.newaxis],
        size=[num_boxes, num_try])

    # TODO: rewrite this noise box function?
    if not enable_grot:
        selected_noise = noise_per_box(gt_boxes[:, [0, 1, 3, 4, 6]],
                                       valid_mask, loc_noises, rot_noises)
    else:
        selected_noise = noise_per_box_v2_(gt_boxes[:, [0, 1, 3, 4, 6]],
                                           valid_mask, loc_noises, rot_noises,
                                           global_rot_noises)

    loc_transforms = _select_transform(loc_noises, selected_noise)
    rot_transforms = _select_transform(rot_noises, selected_noise)

    if points is not None:
        # Corners and surfaces are only needed to find the points of each
        # box, so they are skipped when there is no point cloud. They must
        # be computed before ``box3d_transform_`` mutates ``gt_boxes``.
        gt_box_corners = box_np_ops.center_to_corner_box3d(
            gt_boxes[:, :3],
            gt_boxes[:, 3:6],
            gt_boxes[:, 6],
            origin=(0.5, 0.5, 0),
            axis=2)
        surfaces = box_np_ops.corner_to_surfaces_3d_jit(gt_box_corners)
        # TODO: replace this points_in_convex function by my tools?
        point_masks = box_np_ops.points_in_convex_polygon_3d_jit(
            points[:, :3], surfaces)
        points_transform_(points, gt_boxes[:, :3], point_masks, loc_transforms,
                          rot_transforms, valid_mask)

    box3d_transform_(gt_boxes, loc_transforms, rot_transforms, valid_mask)
@OBJECTSAMPLERS.register_module()
class DataBaseSampler(object):
    """Class for sampling data from the ground truth database.

    Args:
        info_path (str): Path of groundtruth database info.
        data_root (str): Path of groundtruth database.
        rate (float): Rate of actual sampled over maximum sampled number.
        prepare (dict): Name of preparation functions and the input value.
            Each key must be the name of a method of this class (for
            example ``filter_by_difficulty``); it is called as
            ``method(db_infos, value)``.
        sample_groups (dict): Sampled classes and numbers.
        classes (list[str]): List of classes. Although the signature
            defaults to None, the constructor enumerates this value
            unconditionally, so a list must be provided.
        points_loader(dict): Config of points loader. Default: dict(
            type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4,
            use_dim=[0,1,2,3])
    """

    def __init__(self,
                 info_path,
                 data_root,
                 rate,
                 prepare,
                 sample_groups,
                 classes=None,
                 points_loader=dict(
                     type='LoadPointsFromFile',
                     coord_type='LIDAR',
                     load_dim=4,
                     use_dim=[0, 1, 2, 3])):
        super().__init__()
        self.data_root = data_root
        self.info_path = info_path
        self.rate = rate
        self.prepare = prepare
        self.classes = classes
        self.cat2label = {name: i for i, name in enumerate(classes)}
        self.label2cat = {i: name for i, name in enumerate(classes)}
        self.points_loader = mmcv.build_from_cfg(points_loader, PIPELINES)

        db_infos = mmcv.load(info_path)

        # filter database infos
        from mmdet3d.utils import get_root_logger
        logger = get_root_logger()
        for k, v in db_infos.items():
            logger.info(f'load {len(v)} {k} database infos')
        for prep_func, val in prepare.items():
            db_infos = getattr(self, prep_func)(db_infos, val)
        logger.info('After filter database:')
        for k, v in db_infos.items():
            logger.info(f'load {len(v)} {k} database infos')

        self.db_infos = db_infos

        # load sample groups
        # TODO: more elegant way to load sample groups
        self.sample_groups = []
        for name, num in sample_groups.items():
            self.sample_groups.append({name: int(num)})

        self.group_db_infos = self.db_infos  # just use db_infos
        self.sample_classes = []
        self.sample_max_nums = []
        for group_info in self.sample_groups:
            self.sample_classes += list(group_info.keys())
            self.sample_max_nums += list(group_info.values())

        self.sampler_dict = {}
        for k, v in self.group_db_infos.items():
            self.sampler_dict[k] = BatchSampler(v, k, shuffle=True)
        # TODO: No group_sampling currently

    @staticmethod
    def filter_by_difficulty(db_infos, removed_difficulty):
        """Filter ground truths by difficulties.

        Args:
            db_infos (dict): Info of groundtruth database.
            removed_difficulty (list): Difficulties that are not qualified.

        Returns:
            dict: Info of database after filtering. A new dict is returned;
                ``db_infos`` itself is not modified.
        """
        new_db_infos = {}
        for key, dinfos in db_infos.items():
            new_db_infos[key] = [
                info for info in dinfos
                if info['difficulty'] not in removed_difficulty
            ]
        return new_db_infos

    @staticmethod
    def filter_by_min_points(db_infos, min_gt_points_dict):
        """Filter ground truths by number of points in the bbox.

        Args:
            db_infos (dict): Info of groundtruth database. Updated in place.
            min_gt_points_dict (dict): Different number of minimum points
                needed for different categories of ground truths.
                Categories whose minimum is not positive are left
                untouched.

        Returns:
            dict: Info of database after filtering (the same object as
                ``db_infos``).
        """
        for name, min_num in min_gt_points_dict.items():
            min_num = int(min_num)
            if min_num > 0:
                filtered_infos = []
                for info in db_infos[name]:
                    if info['num_points_in_gt'] >= min_num:
                        filtered_infos.append(info)
                db_infos[name] = filtered_infos
        return db_infos

    def sample_all(self, gt_bboxes, gt_labels, img=None):
        """Sampling all categories of bboxes.

        Args:
            gt_bboxes (np.ndarray): Ground truth bounding boxes.
            gt_labels (np.ndarray): Ground truth labels of boxes.
            img: Not used by this method; accepted for interface
                compatibility. Default: None.

        Returns:
            dict | None: Dict of sampled 'pseudo ground truths', or None
                when nothing was sampled.

                - gt_labels_3d (np.ndarray): ground truths labels
                    of sampled objects.
                - gt_bboxes_3d (np.ndarray): sampled ground truth 3D
                    bounding boxes, stacked from each sample's
                    ``box3d_lidar``.
                - points: sampled points, concatenated with the ``cat``
                    method of the loaded points object.
                - group_ids (np.ndarray): ids of sampled ground truths
        """
        # Number of objects to add per class: the configured maximum minus
        # the objects of that class already present, scaled by ``rate``.
        sample_num_per_class = []
        for class_name, max_sample_num in zip(self.sample_classes,
                                              self.sample_max_nums):
            class_label = self.cat2label[class_name]
            sampled_num = int(max_sample_num -
                              np.sum([n == class_label for n in gt_labels]))
            sampled_num = np.round(self.rate * sampled_num).astype(np.int64)
            sample_num_per_class.append(sampled_num)

        sampled = []
        sampled_gt_bboxes = []
        # Boxes that new samples must not collide with; grows as classes
        # are processed.
        avoid_coll_boxes = gt_bboxes

        for class_name, sampled_num in zip(self.sample_classes,
                                           sample_num_per_class):
            if sampled_num > 0:
                sampled_cls = self.sample_class_v2(class_name, sampled_num,
                                                   avoid_coll_boxes)

                sampled += sampled_cls
                if len(sampled_cls) > 0:
                    # np.stack also handles the single-sample case and
                    # yields shape (1, box_dim).
                    sampled_gt_box = np.stack(
                        [s['box3d_lidar'] for s in sampled_cls], axis=0)

                    sampled_gt_bboxes += [sampled_gt_box]
                    avoid_coll_boxes = np.concatenate(
                        [avoid_coll_boxes, sampled_gt_box], axis=0)

        ret = None
        if len(sampled) > 0:
            sampled_gt_bboxes = np.concatenate(sampled_gt_bboxes, axis=0)

            s_points_list = []
            for info in sampled:
                file_path = os.path.join(
                    self.data_root,
                    info['path']) if self.data_root else info['path']
                results = dict(pts_filename=file_path)
                s_points = self.points_loader(results)['points']
                s_points.translate(info['box3d_lidar'][:3])

                s_points_list.append(s_points)

            # np.long is absent from NumPy 1.24-1.26; use an explicit
            # 64-bit integer dtype instead.
            gt_labels = np.array([self.cat2label[s['name']] for s in sampled],
                                 dtype=np.int64)
            ret = {
                'gt_labels_3d':
                gt_labels,
                'gt_bboxes_3d':
                sampled_gt_bboxes,
                'points':
                s_points_list[0].cat(s_points_list),
                'group_ids':
                np.arange(gt_bboxes.shape[0],
                          gt_bboxes.shape[0] + len(sampled))
            }

        return ret

    def sample_class_v2(self, name, num, gt_bboxes):
        """Sampling specific categories of bounding boxes.

        Args:
            name (str): Class of objects to be sampled.
            num (int): Number of sampled bboxes.
            gt_bboxes (np.ndarray): Ground truth boxes.

        Returns:
            list[dict]: Valid samples after collision test. Empty when the
                sampler yields no candidates.
        """
        sampled = self.sampler_dict[name].sample(num)
        sampled = copy.deepcopy(sampled)
        num_gt = gt_bboxes.shape[0]
        num_sampled = len(sampled)
        if num_sampled == 0:
            # np.stack below rejects an empty list, so return early when
            # the sampler has nothing to offer for this class.
            return []

        gt_bboxes_bv = box_np_ops.center_to_corner_box2d(
            gt_bboxes[:, 0:2], gt_bboxes[:, 3:5], gt_bboxes[:, 6])

        sp_boxes = np.stack([i['box3d_lidar'] for i in sampled], axis=0)
        boxes = np.concatenate([gt_bboxes, sp_boxes], axis=0).copy()

        sp_boxes_new = boxes[gt_bboxes.shape[0]:]
        sp_boxes_bv = box_np_ops.center_to_corner_box2d(
            sp_boxes_new[:, 0:2], sp_boxes_new[:, 3:5], sp_boxes_new[:, 6])

        total_bv = np.concatenate([gt_bboxes_bv, sp_boxes_bv], axis=0)
        coll_mat = data_augment_utils.box_collision_test(total_bv, total_bv)
        # A box always overlaps itself; ignore the diagonal.
        diag = np.arange(total_bv.shape[0])
        coll_mat[diag, diag] = False

        valid_samples = []
        for i in range(num_gt, num_gt + num_sampled):
            if coll_mat[i].any():
                # Rejected sample: clear its row and column so it does not
                # cause later samples to be rejected as well.
                coll_mat[i] = False
                coll_mat[:, i] = False
            else:
                valid_samples.append(sampled[i - num_gt])
        return valid_samples
- img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) - proposals: (1)to tensor, (2)to DataContainer - gt_bboxes: (1)to tensor, (2)to DataContainer - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer - gt_labels: (1)to tensor, (2)to DataContainer - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True) - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \ (3)to DataContainer (stack=True) """ def __init__(self, ): return def __call__(self, results): """Call function to transform and format common fields in results. Args: results (dict): Result dict contains the data to convert. Returns: dict: The result dict contains the data that is formatted with default bundle. """ if 'img' in results: if isinstance(results['img'], list): # process multiple imgs in single frame imgs = [img.transpose(2, 0, 1) for img in results['img']] imgs = np.ascontiguousarray(np.stack(imgs, axis=0)) results['img'] = DC(to_tensor(imgs), stack=True) else: img = np.ascontiguousarray(results['img'].transpose(2, 0, 1)) results['img'] = DC(to_tensor(img), stack=True) for key in [ 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels', 'gt_labels_3d', 'pts_instance_mask', 'pts_semantic_mask', 'gt_pts_centers_view', 'gt_img_centers_view', 'gt_visible_3d' ]: if key not in results: continue if isinstance(results[key], list): results[key] = DC([to_tensor(res) for res in results[key]]) else: results[key] = DC(to_tensor(results[key])) if 'gt_bboxes_3d' in results: if isinstance(results['gt_bboxes_3d'], BaseInstance3DBoxes): results['gt_bboxes_3d'] = DC( results['gt_bboxes_3d'], cpu_only=True) else: results['gt_bboxes_3d'] = DC( to_tensor(results['gt_bboxes_3d'])) if 'gt_bboxes_cam_view' in results: if isinstance(results['gt_bboxes_cam_view'], BaseInstance3DBoxes): results['gt_bboxes_cam_view'] = DC( results['gt_bboxes_cam_view'], cpu_only=True) else: results['gt_bboxes_cam_view'] = DC( to_tensor(results['gt_bboxes_cam_view'])) if 'gt_bboxes_lidar_view' in results: if 
@PIPELINES.register_module()
class Collect3D(object):
    """Gather the requested entries of ``results`` into the output dict.

    This is usually the last stage of the data loader pipeline. The
    returned dict holds one entry per name in ``keys`` plus an
    ``img_metas`` entry: a ``DataContainer`` (``cpu_only=True``) wrapping a
    dict of whichever ``meta_keys`` are present in ``results``. Meta keys
    missing from ``results`` are skipped silently, whereas a missing entry
    of ``keys`` raises ``KeyError``.

    Meaning of the commonly used meta keys:

        - 'img_shape': shape of the image input to the network as a tuple
            (h, w, c). Note that images may be zero padded on the
            bottom/right if the batch tensor is larger than this shape.
        - 'scale_factor': a float indicating the preprocessing scale
        - 'flip': a boolean indicating if image flip transform was used
        - 'filename': path to the image file
        - 'ori_shape': original shape of the image as a tuple (h, w, c)
        - 'pad_shape': image shape after padding
        - 'lidar2img': transform from lidar to image
        - 'pcd_horizontal_flip': a boolean indicating if point cloud is
            flipped horizontally
        - 'pcd_vertical_flip': a boolean indicating if point cloud is
            flipped vertically
        - 'box_mode_3d': 3D box mode
        - 'box_type_3d': 3D box type
        - 'img_norm_cfg': a dict of normalization information:
            - mean: per channel mean subtraction
            - std: per channel std divisor
            - to_rgb: bool indicating if bgr was converted to rgb
        - 'rect': rectification matrix
        - 'Trv2c': transformation from velodyne to camera coordinate
        - 'P2': transformation between cameras
        - 'pcd_trans': point cloud transformations
        - 'sample_idx': sample index
        - 'pcd_scale_factor': point cloud scale factor
        - 'pcd_rotation': rotation applied to point cloud
        - 'pts_filename': path to point cloud file.

    Args:
        keys (Sequence[str]): Keys of results to be collected in ``data``.
        meta_keys (Sequence[str], optional): Meta keys to be converted to
            ``mmcv.DataContainer`` and collected in ``data[img_metas]``.
            Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img',
            'pad_shape', 'scale_factor', 'flip', 'image_flip',
            'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d',
            'box_type_3d', 'img_norm_cfg', 'rect', 'Trv2c', 'P2',
            'pcd_trans', 'sample_idx', 'pcd_scale_factor', 'pcd_rotation',
            'pts_filename', 'transformation_3d_flow', 'cam_intrinsic',
            'lidar2cam_r', 'lidar2cam_t', 'valid_shape',
            'img_scale_ratios', 'pcd_rotation_angle')
    """

    def __init__(self,
                 keys,
                 meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img',
                            'pad_shape', 'scale_factor', 'flip', 'image_flip',
                            'pcd_horizontal_flip', 'pcd_vertical_flip',
                            'box_mode_3d', 'box_type_3d', 'img_norm_cfg',
                            'rect', 'Trv2c', 'P2', 'pcd_trans', 'sample_idx',
                            'pcd_scale_factor', 'pcd_rotation',
                            'pts_filename', 'transformation_3d_flow',
                            'cam_intrinsic', 'lidar2cam_r', 'lidar2cam_t',
                            'valid_shape', 'img_scale_ratios',
                            'pcd_rotation_angle')):
        self.keys = keys
        self.meta_keys = meta_keys

    def __call__(self, results):
        """Call function to collect keys in results.

        The keys in ``meta_keys`` will be converted to
        :obj:`mmcv.DataContainer`.

        Args:
            results (dict): Result dict contains the data to collect.

        Returns:
            dict: The result dict contains the following keys

                - keys in ``self.keys``
                - ``img_metas``
        """
        present_metas = {
            name: results[name]
            for name in self.meta_keys if name in results
        }
        # 'img_metas' goes in first, so the insertion order of the output
        # dict is metas followed by the requested keys.
        collected = {'img_metas': DC(present_metas, cpu_only=True)}
        for name in self.keys:
            collected[name] = results[name]
        return collected

    def __repr__(self):
        """str: Return a string that describes the module."""
        cls_name = self.__class__.__name__
        return f'{cls_name}(keys={self.keys}, meta_keys={self.meta_keys})'
- img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) - proposals: (1)to tensor, (2)to DataContainer - gt_bboxes: (1)to tensor, (2)to DataContainer - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer - gt_labels: (1)to tensor, (2)to DataContainer """ def __init__(self, class_names, with_gt=True, with_label=True): super(DefaultFormatBundle3D, self).__init__() self.class_names = class_names self.with_gt = with_gt self.with_label = with_label def __call__(self, results): """Call function to transform and format common fields in results. Args: results (dict): Result dict contains the data to convert. Returns: dict: The result dict contains the data that is formatted with default bundle. """ # Format 3D data if 'points' in results: assert isinstance(results['points'], BasePoints) results['points'] = DC(results['points'].tensor) for key in ['voxels', 'coors', 'voxel_centers', 'num_points']: if key not in results: continue results[key] = DC(to_tensor(results[key]), stack=False) if self.with_gt: # Clean GT bboxes in the final if 'gt_bboxes_3d_mask' in results: gt_bboxes_3d_mask = results['gt_bboxes_3d_mask'] results['gt_bboxes_3d'] = results['gt_bboxes_3d'][ gt_bboxes_3d_mask] if 'gt_names_3d' in results: results['gt_names_3d'] = results['gt_names_3d'][ gt_bboxes_3d_mask] if 'gt_bboxes_mask' in results: gt_bboxes_mask = results['gt_bboxes_mask'] if 'gt_bboxes' in results: results['gt_bboxes'] = results['gt_bboxes'][gt_bboxes_mask] results['gt_names'] = results['gt_names'][gt_bboxes_mask] if self.with_label: if 'gt_names' in results and len(results['gt_names']) == 0: results['gt_labels'] = np.array([], dtype=np.int64) elif 'gt_names' in results and isinstance( results['gt_names'][0], list): # gt_labels might be a list of list in multi-view setting results['gt_labels'] = [ np.array([self.class_names.index(n) for n in res], dtype=np.int64) for res in results['gt_names'] ] elif 'gt_names' in results: results['gt_labels'] = np.array([ self.class_names.index(n) 
@PIPELINES.register_module()
class MyResize(object):
    """Resize images & bbox & mask.

    This transform resizes the input images to some scale. Bboxes and masks
    are then resized with the same scale factor. If the input dict contains
    the key "scale", then the scale in the input dict is used, otherwise the
    specified scale in the init method is used. If the input dict contains
    the key "scale_factor" (if MultiScaleFlipAug does not give img_scale but
    scale_factor), the actual scale will be computed by image shape and
    scale_factor.

    Unlike a single-image resize, ``results['img']`` is treated as a
    sequence of images and every element is resized.

    `img_scale` can either be a tuple (single-scale) or a list of tuple
    (multi-scale). There are 3 multiscale modes:

    - ``ratio_range is not None``: randomly sample a ratio from the ratio
      range and multiply it with the image scale.
    - ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly
      sample a scale from the multiscale range.
    - ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly
      sample a scale from multiple scales.

    Args:
        img_scale (tuple or list[tuple]): Images scales for resizing.
        multiscale_mode (str): Either "range" or "value".
        ratio_range (tuple[float]): (min_ratio, max_ratio)
        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
            image.
        bbox_clip_border (bool, optional): Whether clip the objects outside
            the border of the image. Defaults to True.
        backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
            These two backends generates slightly different results. Defaults
            to 'cv2'.
        override (bool, optional): Whether to override `scale` and
            `scale_factor` so as to call resize twice. Default False. If True,
            after the first resizing, the existed `scale` and `scale_factor`
            will be ignored so the second resizing can be allowed.
            This option is a work-around for multiple times of resize in DETR.
            Defaults to False.
    """

    def __init__(self,
                 img_scale=None,
                 multiscale_mode='range',
                 ratio_range=None,
                 keep_ratio=True,
                 bbox_clip_border=True,
                 backend='cv2',
                 override=False):
        if img_scale is None:
            self.img_scale = None
        else:
            if isinstance(img_scale, list):
                self.img_scale = img_scale
            else:
                self.img_scale = [img_scale]
            assert mmcv.is_list_of(self.img_scale, tuple)

        if ratio_range is not None:
            # mode 1: given a scale and a range of image ratio
            assert len(self.img_scale) == 1
        else:
            # mode 2: given multiple scales or a range of scales
            assert multiscale_mode in ['value', 'range']

        self.backend = backend
        self.multiscale_mode = multiscale_mode
        self.ratio_range = ratio_range
        self.keep_ratio = keep_ratio
        # TODO: refactor the override option in Resize
        self.override = override
        self.bbox_clip_border = bbox_clip_border

    @staticmethod
    def random_select(img_scales):
        """Randomly select an img_scale from given candidates.

        Args:
            img_scales (list[tuple]): Images scales for selection.

        Returns:
            (tuple, int): Returns a tuple ``(img_scale, scale_idx)``,
                where ``img_scale`` is the selected image scale and
                ``scale_idx`` is the selected index in the given candidates.
        """

        assert mmcv.is_list_of(img_scales, tuple)
        scale_idx = np.random.randint(len(img_scales))
        img_scale = img_scales[scale_idx]
        return img_scale, scale_idx

    @staticmethod
    def random_sample(img_scales):
        """Randomly sample an img_scale when ``multiscale_mode=='range'``.

        Args:
            img_scales (list[tuple]): Images scale range for sampling.
                There must be two tuples in img_scales, which specify the
                lower and upper bound of image scales.

        Returns:
            (tuple, None): Returns a tuple ``(img_scale, None)``, where
                ``img_scale`` is sampled scale and None is just a placeholder
                to be consistent with :func:`random_select`.
        """

        assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2
        img_scale_long = [max(s) for s in img_scales]
        img_scale_short = [min(s) for s in img_scales]
        long_edge = np.random.randint(
            min(img_scale_long),
            max(img_scale_long) + 1)
        short_edge = np.random.randint(
            min(img_scale_short),
            max(img_scale_short) + 1)
        img_scale = (long_edge, short_edge)
        return img_scale, None

    @staticmethod
    def random_sample_ratio(img_scale, ratio_range):
        """Randomly sample an img_scale when ``ratio_range`` is specified.

        A ratio will be randomly sampled from the range specified by
        ``ratio_range``. Then it would be multiplied with ``img_scale`` to
        generate sampled scale.

        Args:
            img_scale (tuple): Images scale base to multiply with ratio.
            ratio_range (tuple[float]): The minimum and maximum ratio to scale
                the ``img_scale``.

        Returns:
            (tuple, None): Returns a tuple ``(scale, None)``, where
                ``scale`` is sampled ratio multiplied with ``img_scale`` and
                None is just a placeholder to be consistent with
                :func:`random_select`.
        """

        assert isinstance(img_scale, tuple) and len(img_scale) == 2
        min_ratio, max_ratio = ratio_range
        assert min_ratio <= max_ratio
        ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio
        scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio)
        return scale, None

    def _random_scale(self, results):
        """Randomly sample an img_scale according to ``ratio_range`` and
        ``multiscale_mode``.

        If ``ratio_range`` is specified, a ratio will be sampled and be
        multiplied with ``img_scale``.
        If multiple scales are specified by ``img_scale``, a scale will be
        sampled according to ``multiscale_mode``.
        Otherwise, single scale will be used.

        Args:
            results (dict): Result dict from :obj:`dataset`.

        Returns:
            None: Two new keys ``scale`` and ``scale_idx`` are added into
                ``results`` in place, which would be used by subsequent
                pipelines.
        """

        if self.ratio_range is not None:
            scale, scale_idx = self.random_sample_ratio(
                self.img_scale[0], self.ratio_range)
        elif len(self.img_scale) == 1:
            scale, scale_idx = self.img_scale[0], 0
        elif self.multiscale_mode == 'range':
            scale, scale_idx = self.random_sample(self.img_scale)
        elif self.multiscale_mode == 'value':
            scale, scale_idx = self.random_select(self.img_scale)
        else:
            raise NotImplementedError

        results['scale'] = scale
        results['scale_idx'] = scale_idx

    def _resize_img(self, results):
        """Resize images with ``results['scale']``.

        Every image in each image field is resized. ``img_shape``,
        ``pad_shape`` and ``scale_factor`` are taken from the last image
        that was resized. If ``valid_shape`` is present it is multiplied by
        ``(w_scale, h_scale)``.
        """
        imgs = results['img']
        # Make 'img' a plain list so that elements can be replaced by
        # resized images of a different shape.
        results['img'] = [imgs[i] for i in range(len(imgs))]
        for key in results.get('img_fields', ['img']):
            for idx in range(len(results['img'])):
                if self.keep_ratio:
                    img, scale_factor = mmcv.imrescale(
                        results[key][idx],
                        results['scale'],
                        return_scale=True,
                        backend=self.backend)
                    # the w_scale and h_scale has minor difference
                    # a real fix should be done in the mmcv.imrescale in the future
                    new_h, new_w = img.shape[:2]
                    h, w = results[key][idx].shape[:2]
                    w_scale = new_w / w
                    h_scale = new_h / h
                else:
                    img, w_scale, h_scale = mmcv.imresize(
                        results[key][idx],
                        results['scale'],
                        return_scale=True,
                        backend=self.backend)
                results[key][idx] = img

            scale_factor = np.array([w_scale, h_scale, w_scale, h_scale],
                                    dtype=np.float32)
            results['img_shape'] = img.shape
            # in case that there is no padding
            results['pad_shape'] = img.shape
            results['scale_factor'] = scale_factor
            results['keep_ratio'] = self.keep_ratio

            if 'valid_shape' in results:
                scaling = np.array([[w_scale, h_scale]])
                results['valid_shape'] = results['valid_shape'] * scaling

    def _resize_bboxes(self, results):
        """Resize bounding boxes with ``results['scale_factor']``."""
        for key in results.get('bbox_fields', []):
            bboxes = results[key] * results['scale_factor']
            if self.bbox_clip_border:
                img_shape = results['img_shape']
                bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
                bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
            results[key] = bboxes

    def _resize_centers(self, results):
        """Scale ``results['gt_img_centers_view']`` and clip to the image.

        The first two columns are multiplied by the width/height scale
        factors and then clipped to ``[0, img_w]`` and ``[0, img_h]``.
        """
        centers = results['gt_img_centers_view']
        centers[:, :2] = centers[:, :2] * results['scale_factor'][:2]
        img_shape = results['img_shape']
        centers[:, 0] = np.clip(centers[:, 0], 0, img_shape[1])
        centers[:, 1] = np.clip(centers[:, 1], 0, img_shape[0])
        results['gt_img_centers_view'] = centers

    def _resize_masks(self, results):
        """Resize masks with ``results['scale']``"""
        for key in results.get('mask_fields', []):
            if results[key] is None:
                continue
            if self.keep_ratio:
                results[key] = results[key].rescale(results['scale'])
            else:
                results[key] = results[key].resize(results['img_shape'][:2])

    def _resize_seg(self, results):
        """Resize semantic segmentation map with ``results['scale']``."""
        for key in results.get('seg_fields', []):
            if self.keep_ratio:
                gt_seg = mmcv.imrescale(
                    results[key],
                    results['scale'],
                    interpolation='nearest',
                    backend=self.backend)
            else:
                gt_seg = mmcv.imresize(
                    results[key],
                    results['scale'],
                    interpolation='nearest',
                    backend=self.backend)
            # Write back to the field that was read. Storing under a fixed
            # 'gt_semantic_seg' key would leave other seg fields unresized
            # and overwrite that key with each of them in turn.
            results[key] = gt_seg

    def _resize_camera(self, results):
        """Scale every matrix in ``results['cam_intrinsic']`` in place.

        Each matrix is left-multiplied by ``diag(w_scale, h_scale, 1)``
        built from ``results['scale_factor']``.
        """
        scale_factor = results['scale_factor']
        w_scale = scale_factor[0]
        h_scale = scale_factor[1]
        scaling_matrix = np.array([
            [w_scale, 0, 0],
            [0, h_scale, 0],
            [0, 0, 1]
        ])
        for i in range(len(results['cam_intrinsic'])):
            results['cam_intrinsic'][i] = scaling_matrix @ results['cam_intrinsic'][i]

    def __call__(self, results):
        """Call function to resize images, bounding boxes, masks, semantic
        segmentation map.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor',
                'keep_ratio' keys are added into result dict.
        """

        if 'scale' not in results:
            if 'scale_factor' in results:
                # Derive the target (w, h) from the first image's (h, w).
                img_shape = results['img'][0].shape[:2]
                scale_factor = results['scale_factor']
                assert isinstance(scale_factor, float)
                results['scale'] = tuple(
                    [int(x * scale_factor) for x in img_shape][::-1])
            else:
                self._random_scale(results)
        else:
            if not self.override:
                assert 'scale_factor' not in results, (
                    'scale and scale_factor cannot be both set.')
            else:
                results.pop('scale')
                if 'scale_factor' in results:
                    results.pop('scale_factor')
                self._random_scale(results)

        self._resize_img(results)
        self._resize_bboxes(results)
        self._resize_masks(results)
        self._resize_seg(results)
        if 'gt_img_centers_view' in results:
            self._resize_centers(results)
        if 'cam_intrinsic' in results:
            self._resize_camera(results)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(img_scale={self.img_scale}, '
        repr_str += f'multiscale_mode={self.multiscale_mode}, '
        repr_str += f'ratio_range={self.ratio_range}, '
        repr_str += f'keep_ratio={self.keep_ratio}, '
        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
        return repr_str
""" def __init__(self, mean, std, to_rgb=True): self.mean = np.array(mean, dtype=np.float32) self.std = np.array(std, dtype=np.float32) self.to_rgb = to_rgb def __call__(self, results): """Call function to normalize images. Args: results (dict): Result dict from loading pipeline. Returns: dict: Normalized results, 'img_norm_cfg' key is added into result dict. """ for key in results.get('img_fields', ['img']): for idx in range(len(results['img'])): results[key][idx] = mmcv.imnormalize(results[key][idx], self.mean, self.std, self.to_rgb) results['img_norm_cfg'] = dict( mean=self.mean, std=self.std, to_rgb=self.to_rgb) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})' return repr_str @PIPELINES.register_module() class MyPad(object): """Pad the image & mask. There are two padding modes: (1) pad to a fixed size and (2) pad to the minimum size that is divisible by some number. Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", Args: size (tuple, optional): Fixed padding size. size_divisor (int, optional): The divisor of padded size. pad_val (float, optional): Padding value, 0 by default. 
""" def __init__(self, size=None, size_divisor=None, pad_val=0): self.size = size self.size_divisor = size_divisor self.pad_val = pad_val # only one of size and size_divisor should be valid assert size is not None or size_divisor is not None assert size is None or size_divisor is None def _pad_img(self, results): """Pad images according to ``self.size``.""" for key in results.get('img_fields', ['img']): if self.size is not None: padded_img = mmcv.impad( results[key], shape=self.size, pad_val=self.pad_val) elif self.size_divisor is not None: for idx in range(len(results[key])): padded_img = mmcv.impad_to_multiple( results[key][idx], self.size_divisor, pad_val=self.pad_val) results[key][idx] = padded_img results['pad_shape'] = padded_img.shape results['pad_fixed_size'] = self.size results['pad_size_divisor'] = self.size_divisor def _pad_masks(self, results): """Pad masks according to ``results['pad_shape']``.""" pad_shape = results['pad_shape'][:2] for key in results.get('mask_fields', []): results[key] = results[key].pad(pad_shape, pad_val=self.pad_val) def _pad_seg(self, results): """Pad semantic segmentation map according to ``results['pad_shape']``.""" for key in results.get('seg_fields', []): results[key] = mmcv.impad( results[key], shape=results['pad_shape'][:2]) def __call__(self, results): """Call function to pad images, masks, semantic segmentation maps. Args: results (dict): Result dict from loading pipeline. Returns: dict: Updated result dict. """ self._pad_img(results) self._pad_masks(results) self._pad_seg(results) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(size={self.size}, ' repr_str += f'size_divisor={self.size_divisor}, ' repr_str += f'pad_val={self.pad_val})' return repr_str @PIPELINES.register_module() class LoadMultiViewImageFromFiles(object): """Load multi channel images from a list of separate channel files. Expects results['img_filename'] to be a list of filenames. 
def pad(self, img):
    """Zero-pad ``img`` at the bottom up to the configured height.

    Used so that all views can be stacked into one array even when some
    cameras deliver shorter images (e.g. the Waymo side cameras, 886 rows
    versus 1280).

    Args:
        img (np.ndarray): Image whose first axis is the height.

    Returns:
        np.ndarray: ``img`` itself when it already has at least
        ``self.img_scale[0]`` rows, otherwise a new array with zero rows
        appended so that its height equals ``self.img_scale[0]``.
    """
    # The number of rows to add is derived from the target height instead of
    # the previous hard-coded 1280 - 886, so other resolutions work too.
    missing_rows = int(self.img_scale[0]) - img.shape[0]
    if missing_rows <= 0:
        return img
    filler = np.zeros((missing_rows,) + img.shape[1:], dtype=img.dtype)
    return np.concatenate([img, filler], axis=0)
""" filename = results['img_filename'] if self.img_scale is None: img = np.stack( [mmcv.imread(name, self.color_type) for name in filename], axis=-1) else: img = np.stack( [self.pad(mmcv.imread(name, self.color_type)) for name in filename], axis=-1) if self.to_float32: img = img.astype(np.float32) results['filename'] = filename # unravel to list, see `DefaultFormatBundle` in formating.py # which will transpose each image separately and then stack into array results['img'] = [img[..., i] for i in range(img.shape[-1])] results['img_shape'] = img.shape results['ori_shape'] = img.shape # Set initial values for default meta_keys results['pad_shape'] = img.shape # results['scale_factor'] = [1.0, 1.0] num_channels = 1 if len(img.shape) < 3 else img.shape[2] results['img_norm_cfg'] = dict( mean=np.zeros(num_channels, dtype=np.float32), std=np.ones(num_channels, dtype=np.float32), to_rgb=False) results['img_fields'] = ['img'] return results def __repr__(self): """str: Return a string that describes the module.""" return "{} (to_float32={}, color_type='{}')".format( self.__class__.__name__, self.to_float32, self.color_type) @PIPELINES.register_module() class LoadPointsFromMultiSweeps(object): """Load points from multiple sweeps. This is usually used for nuScenes dataset to utilize previous sweeps. Args: sweeps_num (int): Number of sweeps. Defaults to 10. load_dim (int): Dimension number of the loaded points. Defaults to 5. use_dim (list[int]): Which dimension to use. Defaults to [0, 1, 2, 4]. file_client_args (dict): Config dict of file clients, refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py for more details. Defaults to dict(backend='disk'). pad_empty_sweeps (bool): Whether to repeat keyframe when sweeps is empty. Defaults to False. remove_close (bool): Whether to remove close points. Defaults to False. test_mode (bool): If test_model=True used for testing, it will not randomly sample sweeps but select the nearest N frames. 
Defaults to False. """ def __init__(self, sweeps_num=10, load_dim=5, use_dim=[0, 1, 2, 4], file_client_args=dict(backend='disk'), pad_empty_sweeps=False, remove_close=False, test_mode=False): self.load_dim = load_dim self.sweeps_num = sweeps_num self.use_dim = use_dim self.file_client_args = file_client_args.copy() self.file_client = None self.pad_empty_sweeps = pad_empty_sweeps self.remove_close = remove_close self.test_mode = test_mode def _load_points(self, pts_filename): """Private function to load point clouds data. Args: pts_filename (str): Filename of point clouds data. Returns: np.ndarray: An array containing point clouds data. """ if self.file_client is None: self.file_client = mmcv.FileClient(**self.file_client_args) try: pts_bytes = self.file_client.get(pts_filename) points = np.frombuffer(pts_bytes, dtype=np.float32) except ConnectionError: mmcv.check_file_exist(pts_filename) if pts_filename.endswith('.npy'): points = np.load(pts_filename) else: points = np.fromfile(pts_filename, dtype=np.float32) return points def _remove_close(self, points, radius=1.0): """Removes point too close within a certain radius from origin. Args: points (np.ndarray): Sweep points. radius (float): Radius below which points are removed. Defaults to 1.0. Returns: np.ndarray: Points after removing. """ if isinstance(points, np.ndarray): points_numpy = points elif isinstance(points, BasePoints): points_numpy = points.tensor.numpy() else: raise NotImplementedError x_filt = np.abs(points_numpy[:, 0]) < radius y_filt = np.abs(points_numpy[:, 1]) < radius not_close = np.logical_not(np.logical_and(x_filt, y_filt)) return points[not_close] def __call__(self, results): """Call function to load multi-sweep point clouds from files. Args: results (dict): Result dict containing multi-sweep point cloud \ filenames. Returns: dict: The result dict containing the multi-sweep points data. \ Added key and value are described below. - points (np.ndarray): Multi-sweep point cloud arrays. 
""" points = results['points'] points.tensor[:, 4] = 0 sweep_points_list = [points] ts = results['timestamp'] if self.pad_empty_sweeps and len(results['sweeps']) == 0: for i in range(self.sweeps_num): if self.remove_close: sweep_points_list.append(self._remove_close(points)) else: sweep_points_list.append(points) else: if len(results['sweeps']) <= self.sweeps_num: choices = np.arange(len(results['sweeps'])) elif self.test_mode: choices = np.arange(self.sweeps_num) else: choices = np.random.choice( len(results['sweeps']), self.sweeps_num, replace=False) for idx in choices: sweep = results['sweeps'][idx] points_sweep = self._load_points(sweep['data_path']) points_sweep = np.copy(points_sweep).reshape(-1, self.load_dim) if self.remove_close: points_sweep = self._remove_close(points_sweep) sweep_ts = sweep['timestamp'] / 1e6 points_sweep[:, :3] = points_sweep[:, :3] @ sweep[ 'sensor2lidar_rotation'].T points_sweep[:, :3] += sweep['sensor2lidar_translation'] points_sweep[:, 4] = ts - sweep_ts points_sweep = points.new_point(points_sweep) sweep_points_list.append(points_sweep) points = points.cat(sweep_points_list) points = points[:, self.use_dim] results['points'] = points return results def __repr__(self): """str: Return a string that describes the module.""" return f'{self.__class__.__name__}(sweeps_num={self.sweeps_num})' @PIPELINES.register_module() class PointSegClassMapping(object): """Map original semantic class to valid category ids. Map valid classes as 0~len(valid_cat_ids)-1 and others as len(valid_cat_ids). Args: valid_cat_ids (tuple[int]): A tuple of valid category. """ def __init__(self, valid_cat_ids): self.valid_cat_ids = valid_cat_ids def __call__(self, results): """Call function to map original semantic class to valid category ids. Args: results (dict): Result dict containing point semantic masks. Returns: dict: The result dict containing the mapped category ids. \ Updated key and value are described below. 
- pts_semantic_mask (np.ndarray): Mapped semantic masks. """ assert 'pts_semantic_mask' in results pts_semantic_mask = results['pts_semantic_mask'] neg_cls = len(self.valid_cat_ids) for i in range(pts_semantic_mask.shape[0]): if pts_semantic_mask[i] in self.valid_cat_ids: converted_id = self.valid_cat_ids.index(pts_semantic_mask[i]) pts_semantic_mask[i] = converted_id else: pts_semantic_mask[i] = neg_cls results['pts_semantic_mask'] = pts_semantic_mask return results def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += '(valid_cat_ids={})'.format(self.valid_cat_ids) return repr_str @PIPELINES.register_module() class NormalizePointsColor(object): """Normalize color of points. Args: color_mean (list[float]): Mean color of the point cloud. """ def __init__(self, color_mean): self.color_mean = color_mean def __call__(self, results): """Call function to normalize color of points. Args: results (dict): Result dict containing point clouds data. Returns: dict: The result dict containing the normalized points. \ Updated key and value are described below. - points (np.ndarray): Points after color normalization. """ points = results['points'] assert points.shape[1] >= 6, \ f'Expect points have channel >=6, got {points.shape[1]}' points[:, 3:6] = points[:, 3:6] - np.array(self.color_mean) / 256.0 results['points'] = points return results def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += '(color_mean={})'.format(self.color_mean) return repr_str @PIPELINES.register_module() class LoadPointsFromFile(object): """Load Points From File. Load sunrgbd and scannet points from file. Args: load_dim (int): The dimension of the loaded points. Defaults to 6. coord_type (str): The type of coordinates of points cloud. Available options includes: - 'LIDAR': Points in LiDAR coordinates. - 'DEPTH': Points in depth coordinates, usually for indoor dataset. 
- 'CAMERA': Points in camera coordinates. use_dim (list[int]): Which dimensions of the points to be used. Defaults to [0, 1, 2]. For KITTI dataset, set use_dim=4 or use_dim=[0, 1, 2, 3] to use the intensity dimension. shift_height (bool): Whether to use shifted height. Defaults to False. file_client_args (dict): Config dict of file clients, refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py for more details. Defaults to dict(backend='disk'). """ def __init__(self, coord_type, load_dim=6, use_dim=[0, 1, 2], shift_height=False, file_client_args=dict(backend='disk')): self.shift_height = shift_height if isinstance(use_dim, int): use_dim = list(range(use_dim)) assert max(use_dim) < load_dim, \ f'Expect all used dimensions < {load_dim}, got {use_dim}' assert coord_type in ['CAMERA', 'LIDAR', 'DEPTH'] self.coord_type = coord_type self.load_dim = load_dim self.use_dim = use_dim self.file_client_args = file_client_args.copy() self.file_client = None def _load_points(self, pts_filename): """Private function to load point clouds data. Args: pts_filename (str): Filename of point clouds data. Returns: np.ndarray: An array containing point clouds data. """ if self.file_client is None: self.file_client = mmcv.FileClient(**self.file_client_args) try: pts_bytes = self.file_client.get(pts_filename) points = np.frombuffer(pts_bytes, dtype=np.float32) except ConnectionError: mmcv.check_file_exist(pts_filename) if pts_filename.endswith('.npy'): points = np.load(pts_filename) else: points = np.fromfile(pts_filename, dtype=np.float32) return points def __call__(self, results): """Call function to load points data from file. Args: results (dict): Result dict containing point clouds data. Returns: dict: The result dict containing the point clouds data. \ Added key and value are described below. - points (np.ndarray): Point clouds data. 
""" pts_filename = results['pts_filename'] points = self._load_points(pts_filename) points = points.reshape(-1, self.load_dim) points = points[:, self.use_dim] attribute_dims = None if self.shift_height: floor_height = np.percentile(points[:, 2], 0.99) height = points[:, 2] - floor_height points = np.concatenate([points, np.expand_dims(height, 1)], 1) attribute_dims = dict(height=3) points_class = get_points_type(self.coord_type) points = points_class( points, points_dim=points.shape[-1], attribute_dims=attribute_dims) results['points'] = points return results def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ + '(' repr_str += 'shift_height={}, '.format(self.shift_height) repr_str += 'file_client_args={}), '.format(self.file_client_args) repr_str += 'load_dim={}, '.format(self.load_dim) repr_str += 'use_dim={})'.format(self.use_dim) return repr_str @PIPELINES.register_module() class LoadAnnotations3D(LoadAnnotations): """Load Annotations3D. Load instance mask and semantic mask of points and encapsulate the items into related fields. Args: with_bbox_3d (bool, optional): Whether to load 3D boxes. Defaults to True. with_label_3d (bool, optional): Whether to load 3D labels. Defaults to True. with_mask_3d (bool, optional): Whether to load 3D instance masks. for points. Defaults to False. with_seg_3d (bool, optional): Whether to load 3D semantic masks. for points. Defaults to False. with_bbox (bool, optional): Whether to load 2D boxes. Defaults to False. with_label (bool, optional): Whether to load 2D labels. Defaults to False. with_mask (bool, optional): Whether to load 2D instance masks. Defaults to False. with_seg (bool, optional): Whether to load 2D semantic masks. Defaults to False. poly2mask (bool, optional): Whether to convert polygon annotations to bitmasks. Defaults to True. seg_3d_dtype (dtype, optional): Dtype of 3D semantic masks. 
Defaults to int64 file_client_args (dict): Config dict of file clients, refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py for more details. """ def __init__(self, with_bbox_3d=True, with_label_3d=True, with_mask_3d=False, with_seg_3d=False, with_bbox=False, with_label=False, with_mask=False, with_seg=False, poly2mask=True, seg_3d_dtype='int', file_client_args=dict(backend='disk')): super().__init__( with_bbox, with_label, with_mask, with_seg, poly2mask, file_client_args=file_client_args) self.with_bbox_3d = with_bbox_3d self.with_label_3d = with_label_3d self.with_mask_3d = with_mask_3d self.with_seg_3d = with_seg_3d self.seg_3d_dtype = seg_3d_dtype def _load_bboxes_3d(self, results): """Private function to load 3D bounding box annotations. Args: results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. Returns: dict: The dict containing loaded 3D bounding box annotations. """ results['gt_bboxes_3d'] = results['ann_info']['gt_bboxes_3d'] results['bbox3d_fields'].append('gt_bboxes_3d') return results def _load_labels_3d(self, results): """Private function to load label annotations. Args: results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. Returns: dict: The dict containing loaded label annotations. """ results['gt_labels_3d'] = results['ann_info']['gt_labels_3d'] return results def _load_masks_3d(self, results): """Private function to load 3D mask annotations. Args: results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. Returns: dict: The dict containing loaded 3D mask annotations. 
""" pts_instance_mask_path = results['ann_info']['pts_instance_mask_path'] if self.file_client is None: self.file_client = mmcv.FileClient(**self.file_client_args) try: mask_bytes = self.file_client.get(pts_instance_mask_path) pts_instance_mask = np.frombuffer(mask_bytes, dtype=np.int) except ConnectionError: mmcv.check_file_exist(pts_instance_mask_path) pts_instance_mask = np.fromfile( pts_instance_mask_path, dtype=np.long) results['pts_instance_mask'] = pts_instance_mask results['pts_mask_fields'].append('pts_instance_mask') return results def _load_semantic_seg_3d(self, results): """Private function to load 3D semantic segmentation annotations. Args: results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. Returns: dict: The dict containing the semantic segmentation annotations. """ pts_semantic_mask_path = results['ann_info']['pts_semantic_mask_path'] if self.file_client is None: self.file_client = mmcv.FileClient(**self.file_client_args) try: mask_bytes = self.file_client.get(pts_semantic_mask_path) # add .copy() to fix read-only bug pts_semantic_mask = np.frombuffer( mask_bytes, dtype=self.seg_3d_dtype).copy() except ConnectionError: mmcv.check_file_exist(pts_semantic_mask_path) pts_semantic_mask = np.fromfile( pts_semantic_mask_path, dtype=np.long) results['pts_semantic_mask'] = pts_semantic_mask results['pts_seg_fields'].append('pts_semantic_mask') return results def __call__(self, results): """Call function to load multiple types annotations. Args: results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. Returns: dict: The dict containing loaded 3D bounding box, label, mask and semantic segmentation annotations. 
""" results = super().__call__(results) if self.with_bbox_3d: results = self._load_bboxes_3d(results) if results is None: return None if self.with_label_3d: results = self._load_labels_3d(results) if self.with_mask_3d: results = self._load_masks_3d(results) if self.with_seg_3d: results = self._load_semantic_seg_3d(results) return results def __repr__(self): """str: Return a string that describes the module.""" indent_str = ' ' repr_str = self.__class__.__name__ + '(\n' repr_str += f'{indent_str}with_bbox_3d={self.with_bbox_3d}, ' repr_str += f'{indent_str}with_label_3d={self.with_label_3d}, ' repr_str += f'{indent_str}with_mask_3d={self.with_mask_3d}, ' repr_str += f'{indent_str}with_seg_3d={self.with_seg_3d}, ' repr_str += f'{indent_str}with_bbox={self.with_bbox}, ' repr_str += f'{indent_str}with_label={self.with_label}, ' repr_str += f'{indent_str}with_mask={self.with_mask}, ' repr_str += f'{indent_str}with_seg={self.with_seg}, ' repr_str += f'{indent_str}poly2mask={self.poly2mask})' return repr_str @PIPELINES.register_module() class MyLoadAnnotations3D(LoadAnnotations3D): def __init__(self, with_bbox_3d=True, with_label_3d=True, with_mask_3d=False, with_seg_3d=False, with_bbox=False, with_label=False, with_mask=False, with_seg=False, poly2mask=True, with_centers=False, with_cam_bbox=False, with_visible=False, seg_3d_dtype='int', file_client_args=dict(backend='disk')): super().__init__( with_bbox_3d=with_bbox_3d, with_label_3d=with_label_3d, with_mask_3d=with_mask_3d, with_seg_3d=with_seg_3d, with_bbox=with_bbox, with_label=with_label, with_mask=with_mask, with_seg=with_seg, poly2mask=poly2mask, seg_3d_dtype=seg_3d_dtype, file_client_args=file_client_args) self.with_centers = with_centers self.with_cam_bbox = with_cam_bbox self.with_visible = with_visible def __call__(self, results): """Call function to load multiple types annotations. Args: results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. 
@PIPELINES.register_module()
class SparseDepth(object):
    """Rasterise the point cloud into per-camera sparse depth maps.

    For every camera view the selected points are projected into the image
    and written into a 2-channel grid (channel 0: depth, channel 1:
    occupancy flag), once per entry of ``scale_factors``. The stacked
    result is stored in ``results['sparse_depth']``.

    Args:
        scale_factors (sequence): Down-sampling factors. Pixel coordinates
            are divided by each factor; the output grid size is always
            ``pad_shape // scale_factors[0]``.
        depth_mean (float): Value subtracted from the depth channel.
            Defaults to 14.41.
        depth_var (float): Variance; the depth channel is divided by its
            square root. Defaults to 156.89.
        exp_time (int): Number of dilation passes that spread depth values
            into empty neighbouring cells. Defaults to 0 (no dilation).
    """
    def __init__(self, scale_factors, depth_mean=14.41, depth_var=156.89, exp_time=0):
        self.scale_factors = scale_factors
        self.depth_mean = depth_mean
        self.depth_var = depth_var
        self.exp_time = exp_time

    def __call__(self, results):
        """Build the sparse depth maps and add them to ``results``.

        Args:
            results (dict): Must provide 'points' (object with a
                ``.tensor``), 'lidar2cam_r', 'lidar2cam_t', 'cam_intrinsic'
                and 'pad_shape'; 'valid_shape' is used when present.

        Returns:
            dict: ``results`` with 'sparse_depth' of shape
            [num_view, num_scale, 2, h_scale_shape, w_scale_shape].
        """
        all_points = results['points'].tensor
        # Keep only rows whose column 4 is exactly 0.
        # NOTE(review): presumably column 4 is the sweep time lag, so this
        # selects the key-frame points -- confirm against the pipeline config.
        curr_mask = all_points[:, 4] == 0
        points = all_points[curr_mask]
        points = points[:, :3]
        # Homogeneous coordinates so one 4x4 matrix applies rotation + shift.
        points_4d = torch.cat([points, torch.ones_like(points[:, :1])], dim=1)

        lidar2cam_rs = results['lidar2cam_r']
        lidar2cam_ts = results['lidar2cam_t']
        cam_intrinsic = results['cam_intrinsic']

        depth_features = []
        for view_id in range(len(lidar2cam_rs)):
            # Image bounds used to reject projections: per-view 'valid_shape'
            # (read as [w, h]) if available, otherwise the padded shape.
            if 'valid_shape' in results:
                h_shape = int(results['valid_shape'][view_id, 1])
                w_shape = int(results['valid_shape'][view_id, 0])
            else:
                h_shape = results['pad_shape'][0]
                w_shape = results['pad_shape'][1]

            # 4x4 extrinsic and intrinsic matrices for this view.
            cam_ext = np.eye(4)
            cam_int = np.eye(4)
            cam_ext[:3, :3] = lidar2cam_rs[view_id]
            cam_ext[:3, 3] = lidar2cam_ts[view_id]
            cam_int[:3, :3] = cam_intrinsic[view_id]

            cam_ext = torch.from_numpy(cam_ext).type_as(points_4d)
            cam_int = torch.from_numpy(cam_int).type_as(points_4d)

            points_4d_view = points_4d @ cam_ext.T
            points_4d_view = points_4d_view @ cam_int.T
            points_2d_view = points_4d_view[:, :2]
            depth = points_4d_view[:, 2]
            # Clamp avoids division by zero / negative depth below.
            # NOTE(review): points behind the camera are not rejected
            # explicitly; they are only dropped if their projection falls
            # outside the image bounds.
            depth = torch.clamp(depth, min=1e-4)
            points_2d_view[:, 0] = points_2d_view[:, 0] / depth
            points_2d_view[:, 1] = points_2d_view[:, 1] / depth

            valid_mask = (points_2d_view[:, 0] > 0) & (points_2d_view[:, 0] < w_shape-1) & \
                         (points_2d_view[:, 1] > 0) & (points_2d_view[:, 1] < h_shape-1)
            points_2d_view = points_2d_view[valid_mask]
            depth = depth[valid_mask]

            # Order points from far to near.
            # NOTE(review): presumably so the nearest point is written last
            # when several points share a cell -- this relies on the write
            # order of indexed assignment with duplicate indices; confirm.
            sort_id = np.argsort(-depth)
            points_2d_view = points_2d_view[sort_id]
            depth = depth[sort_id]

            depth_features_view = []
            # One grid size for all scales, derived from the first factor,
            # so the per-scale maps can be stacked.
            w_scale_shape = results['pad_shape'][1] // self.scale_factors[0]
            h_scale_shape = results['pad_shape'][0] // self.scale_factors[0]
            for scale in self.scale_factors:
                w_scale_factor = 1.0 / scale
                h_scale_factor = 1.0 / scale
                scale_factor = torch.Tensor([[w_scale_factor, h_scale_factor]])
                # Channel 0: depth, channel 1: 1 where a depth value exists.
                depth_feature = torch.zeros((2, h_scale_shape, w_scale_shape))
                points_2d_view_scale = points_2d_view * scale_factor
                cx = points_2d_view_scale[:, 0].long()
                cy = points_2d_view_scale[:, 1].long()
                depth_feature[0, cy, cx] = depth
                depth_feature[1, cy, cx] = 1
                if self.exp_time > 0:
                    # Dilation: empty cells are marked with the sentinel 9999
                    # and then take the minimum depth of their 4 neighbours,
                    # repeated exp_time times.
                    zero_inds = depth_feature[1] == 0
                    depth_map = depth_feature[0]
                    depth_map[zero_inds] = 9999
                    for i in range(self.exp_time):
                        depth_feature_new = torch.zeros_like(depth_map) + 9999
                        depth_feature_new[1:] = torch.minimum(depth_feature_new[1:], depth_map[:-1])
                        depth_feature_new[:-1] = torch.minimum(depth_feature_new[:-1], depth_map[1:])
                        depth_feature_new[:, 1:] = torch.minimum(depth_feature_new[:, 1:], depth_map[:, :-1])
                        depth_feature_new[:, :-1] = torch.minimum(depth_feature_new[:, :-1], depth_map[:, 1:])
                        # Only cells that are still empty receive a value;
                        # already filled cells keep their depth.
                        depth_map = torch.where(zero_inds, depth_feature_new, depth_map)
                        zero_inds = depth_map == 9999
                    # Cells never reached go back to 0; all others are valid.
                    depth_map[zero_inds] = 0
                    depth_feature[0] = depth_map
                    depth_feature[1, torch.logical_not(zero_inds)] = 1
                depth_features_view.append(depth_feature)
            depth_features_view = torch.stack(depth_features_view, dim=0)  # [num_scale, 2, h_scale_shape, w_scale_shape]
            depth_features.append(depth_features_view)
        depth_features = torch.stack(depth_features, dim=0)  # [num_view, num_scale, 2, h_scale_shape, w_scale_shape]
        # Standardise the depth channel, then multiply by the occupancy
        # channel so empty cells are exactly 0 again.
        depth_features[:, :, 0] = (depth_features[:, :, 0] - self.depth_mean) / np.sqrt(self.depth_var)
        depth_features[:, :, 0] = depth_features[:, :, 0] * depth_features[:, :, 1]
        results['sparse_depth'] = depth_features
        return results
Note that it works only when 'flip' is turned on. pcd_vertical_flip (bool): Whether apply vertical flip augmentation to point cloud. Defaults to True. Note that it works only when 'flip' is turned on. """ def __init__(self, transforms, img_scale, pts_scale_ratio, pts_rotation=0, flip=False, flip_direction='horizontal', pcd_horizontal_flip=False, pcd_vertical_flip=False): self.transforms = Compose(transforms) self.img_scale = img_scale if isinstance(img_scale, list) else [img_scale] self.pts_scale_ratio = pts_scale_ratio \ if isinstance(pts_scale_ratio, list) else[float(pts_scale_ratio)] self.pts_rotation = pts_rotation if isinstance(pts_rotation, list) else[float(pts_rotation)] assert mmcv.is_list_of(self.img_scale, tuple) assert mmcv.is_list_of(self.pts_scale_ratio, float) assert mmcv.is_list_of(self.pts_rotation, float) self.flip = flip self.pcd_horizontal_flip = pcd_horizontal_flip self.pcd_vertical_flip = pcd_vertical_flip self.flip_direction = flip_direction if isinstance( flip_direction, list) else [flip_direction] assert mmcv.is_list_of(self.flip_direction, str) if not self.flip and self.flip_direction != ['horizontal']: warnings.warn( 'flip_direction has no effect when flip is set to False') if (self.flip and not any([(t['type'] == 'RandomFlip3D' or t['type'] == 'RandomFlip') for t in transforms])): warnings.warn( 'flip has no effect when RandomFlip is not in transforms') def __call__(self, results): """Call function to augment common fields in results. Args: results (dict): Result dict contains the data to augment. Returns: dict: The result dict contains the data that is augmented with \ different scales and flips. 
""" aug_data = [] # modified from `flip_aug = [False, True] if self.flip else [False]` # to reduce unnecessary scenes when using double flip augmentation # during test time flip_aug = [True] if self.flip else [False] pcd_horizontal_flip_aug = [False, True] \ if self.flip and self.pcd_horizontal_flip else [False] pcd_vertical_flip_aug = [False, True] \ if self.flip and self.pcd_vertical_flip else [False] for scale in self.img_scale: for pts_scale_ratio in self.pts_scale_ratio: for pts_rotation in self.pts_rotation: for flip in flip_aug: for pcd_horizontal_flip in pcd_horizontal_flip_aug: for pcd_vertical_flip in pcd_vertical_flip_aug: for direction in self.flip_direction: # results.copy will cause bug # since it is shallow copy _results = deepcopy(results) _results['scale'] = scale _results['flip'] = flip _results['pcd_scale_factor'] = \ pts_scale_ratio _results['flip_direction'] = direction _results['pcd_horizontal_flip'] = \ pcd_horizontal_flip _results['pcd_vertical_flip'] = \ pcd_vertical_flip _results['pcd_rotation_angle'] = pts_rotation data = self.transforms(_results) aug_data.append(data) # list of dict to dict of list aug_data_dict = {key: [] for key in aug_data[0]} for data in aug_data: for key, val in data.items(): aug_data_dict[key].append(val) return aug_data_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(transforms={self.transforms}, ' repr_str += f'img_scale={self.img_scale}, flip={self.flip}, ' repr_str += f'pts_scale_ratio={self.pts_scale_ratio}, ' repr_str += f'flip_direction={self.flip_direction})' return repr_str ================================================ FILE: mmdet3d/datasets/pipelines/transforms_2d.py ================================================ import copy import inspect import math import warnings import cv2 import mmcv import numpy as np from numpy import random from mmdet.datasets.builder import PIPELINES @PIPELINES.register_module() class 
@PIPELINES.register_module()
class OurRandomAffine:
    """Random flip / scale / translate augmentation for multi-view images.

    For every view in ``results['img']`` the transform optionally mirrors
    the image horizontally, scales it by a random ratio and, when the image
    is enlarged, shifts it by a random offset.  The resulting 3x3
    image-space matrix is then applied to the per-view 2D annotations and
    folded into the camera calibration entries (``lidar2img``,
    ``cam_intrinsic``, ``lidar2cam_r``, ``lidar2cam_t``).

    Args:
        scaling_ratio_range (tuple[float]): Min and max scaling ratio; the
            minimum must be positive. Default: (0.5, 1.5).
        flip_ratio (float): Probability of mirroring a view when
            ``flip_sync_3d`` is False. Default: 0.5.
        border (tuple[int]): Margin (height, width) added twice to the
            image size to obtain the output canvas size. Default: (0, 0).
        border_val (tuple[float]): Fill value for pixels outside the warped
            image. Default: (103.53, 116.28, 123.675).
        bbox_clip_border (bool): Whether warped 2D boxes are clipped to the
            canvas. Default: True.
        flip_sync_3d (bool): If True, every view is mirrored exactly when
            an odd number of ``pcd_horizontal_flip`` / ``pcd_vertical_flip``
            flags is set in the results, instead of sampling the flip.
            Default: False.
        scaling_sync_view (bool): If True, one scaling ratio is drawn and
            shared by all views; otherwise one is drawn per view.
            Default: False.
        trans_when_scaling (bool): If True, enlarged images are randomly
            translated; otherwise no translation is applied.
            Default: True.
    """

    def __init__(self,
                 scaling_ratio_range=(0.5, 1.5),
                 flip_ratio=0.5,
                 border=(0, 0),
                 border_val=(103.53, 116.28, 123.675),
                 bbox_clip_border=True,
                 flip_sync_3d=False,
                 scaling_sync_view=False,
                 trans_when_scaling=True,
                 ):
        assert scaling_ratio_range[0] <= scaling_ratio_range[1]
        assert scaling_ratio_range[0] > 0
        self.scaling_ratio_range = scaling_ratio_range
        self.flip_ratio = flip_ratio
        self.border = border
        self.border_val = border_val
        self.bbox_clip_border = bbox_clip_border
        self.flip_sync = flip_sync_3d
        self.scaling_sync_view = scaling_sync_view
        self.trans_when_scaling = trans_when_scaling

    def _transform_bbox(self, results, warp_mats, flips, width, height):
        """Warp the per-view 2D annotations and drop objects that left view.

        ``results['gt_labels'][:, 1]`` is used as the view index of each
        object and ``results['gt_bboxes']`` is read as (cx, cy, w, h).
        An object is kept only if its warped image centre lies strictly
        inside the canvas.

        Args:
            results (dict): Result dict, updated in place.
            warp_mats (list[np.ndarray]): 3x3 scale+translate matrix per
                view.
            flips (list[bool]): Whether each view was mirrored.
            width (int): Canvas width.
            height (int): Canvas height.

        Returns:
            dict: The updated result dict.
        """
        valid_mask = np.ones(results['gt_labels'].shape[0]) > 0
        bboxes_cam = results.get('gt_bboxes_cam_view', None)

        for view_id in range(len(warp_mats)):
            warp_matrix = warp_mats[view_id]
            bbox_mask = results['gt_labels'][:, 1] == view_id
            if np.sum(bbox_mask) == 0:
                continue

            flip = flips[view_id]
            flip_matrix = self._get_flip_matrix(flip, width)
            if bboxes_cam is not None and flip:
                # Mirror the camera-frame boxes: negate every 7th column
                # starting at 0 and reflect the yaw stored in column 6.
                bboxes_cam.tensor[bbox_mask, 0::7] = \
                    -bboxes_cam.tensor[bbox_mask, 0::7]
                bboxes_cam.tensor[bbox_mask, 6] = \
                    -bboxes_cam.tensor[bbox_mask, 6] + np.pi

            bbox_view = results['gt_bboxes'][bbox_mask]
            centers_view = results['gt_img_centers_view'][bbox_mask, :2]
            num_bboxes = bbox_view.shape[0]

            # Four corners per box, ordered tl, tr, bl, br.
            half_w = bbox_view[:, 2] / 2
            half_h = bbox_view[:, 3] / 2
            x_left = bbox_view[:, 0] - half_w
            x_right = bbox_view[:, 0] + half_w
            y_top = bbox_view[:, 1] - half_h
            y_bottom = bbox_view[:, 1] + half_h
            xs = np.vstack([x_left, x_right, x_left, x_right]).T  # [N, 4]
            ys = np.vstack([y_top, y_top, y_bottom, y_bottom]).T  # [N, 4]
            xs = xs.reshape(-1)  # [N*4,]
            ys = ys.reshape(-1)  # [N*4,]
            ones = np.ones_like(ys)
            points = np.vstack([xs, ys, ones])  # [3, N*4]

            # The image is flipped first and warped afterwards.
            warp_points = warp_matrix @ flip_matrix @ points  # [3, N*4]
            warp_points = warp_points[:2] / warp_points[2]
            xs = warp_points[0].reshape(num_bboxes, 4)  # [N, 4]
            ys = warp_points[1].reshape(num_bboxes, 4)  # [N, 4]

            # Axis-aligned hull of the warped corners.
            xs_min = xs.min(1)
            ys_min = ys.min(1)
            xs_max = xs.max(1)
            ys_max = ys.max(1)
            if self.bbox_clip_border:
                xs_min = xs_min.clip(0, width)
                xs_max = xs_max.clip(0, width)
                ys_min = ys_min.clip(0, height)
                ys_max = ys_max.clip(0, height)

            cxs = (xs_min + xs_max) / 2
            cys = (ys_min + ys_max) / 2
            ws = xs_max - xs_min
            hs = ys_max - ys_min
            warp_bboxes = np.vstack((cxs, cys, ws, hs)).T  # [N, 4]

            ones = np.ones_like(centers_view[:, :1])  # [N, 1]
            center_points = np.concatenate(
                [centers_view, ones], axis=1).T  # [3, N]
            warp_points = warp_matrix @ flip_matrix @ center_points
            warp_points = warp_points[:2] / warp_points[2]
            new_center_points = warp_points.T  # [N, 2]

            valid_mask_view = (
                (new_center_points[:, 0] > 0)
                & (new_center_points[:, 0] < width - 1)
                & (new_center_points[:, 1] > 0)
                & (new_center_points[:, 1] < height - 1))
            valid_mask[bbox_mask] = valid_mask_view

            results['gt_bboxes'][bbox_mask] = warp_bboxes
            results['gt_img_centers_view'][bbox_mask, :2] = \
                new_center_points

        # Optional annotations are filtered only when present, each under
        # its own guard, so all per-object arrays stay aligned.
        for key in ('gt_bboxes_cam_view', 'gt_bboxes_lidar_view',
                    'gt_pts_centers_view'):
            if key in results:
                results[key] = results[key][valid_mask]
        results['gt_bboxes'] = results['gt_bboxes'][valid_mask]
        results['gt_img_centers_view'] = \
            results['gt_img_centers_view'][valid_mask]
        results['gt_labels'] = results['gt_labels'][valid_mask]

        return results

    def _transform_camera(self, results, warp_mats, flips, width):
        """Fold the image-space transform into the camera calibration.

        Args:
            results (dict): Result dict, updated in place.
            warp_mats (list[np.ndarray]): 3x3 scale+translate matrix per
                view.
            flips (list[bool]): Whether each view was mirrored.
            width (int): Canvas width, used to build the flip matrix.

        Returns:
            dict: The updated result dict.
        """
        for view_id in range(len(warp_mats)):
            flip = flips[view_id]
            flip_matrix = self._get_flip_matrix(flip, width)
            # Same array object as results['cam_intrinsic'][view_id]; the
            # in-place principal-point update below is therefore visible
            # through ``intrinsic`` as well.
            intrinsic = results['cam_intrinsic'][view_id]
            warp_matrix = warp_mats[view_id] @ flip_matrix

            # lidar2img receives the full image-space transform.
            viewpad = np.eye(4)
            viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = warp_matrix
            results['lidar2img'][view_id] = \
                viewpad @ results['lidar2img'][view_id]

            if flip:
                # The mirror is expressed by negating the camera x axis in
                # the extrinsics and reflecting the principal point.
                mirror = np.eye(3)
                mirror[0, 0] = -1
                results['lidar2cam_r'][view_id] = \
                    mirror @ results['lidar2cam_r'][view_id]
                results['lidar2cam_t'][view_id] = \
                    mirror @ results['lidar2cam_t'][view_id]
                results['cam_intrinsic'][view_id][0, 2] = \
                    width - results['cam_intrinsic'][view_id][0, 2]

            # Scale and translation go into the intrinsics.
            results['cam_intrinsic'][view_id] = \
                warp_mats[view_id] @ intrinsic

        return results

    def __call__(self, results):
        """Augment all views and update annotations and calibration.

        Args:
            results (dict): Result dict from loading pipeline; must hold
                the image list under ``'img'`` and the calibration entries
                used by ``_transform_camera``.

        Returns:
            dict: Updated result dict; ``'image_flip'``, ``'valid_shape'``
                and ``'img_scale_ratios'`` are added.
        """
        warp_mats = []
        flips = []
        scaling_ratios = []
        valid_shapes = []
        results['image_flip'] = []

        # Two 3D flips cancel out, so the images are mirrored only for an
        # odd number of point-cloud flips.  Truthiness is used, matching
        # how the 3D flip transforms in this project read the same flags.
        flip_3d = False
        for key in ('pcd_horizontal_flip', 'pcd_vertical_flip'):
            if results.get(key, False):
                flip_3d = not flip_3d

        ratio_lo, ratio_hi = \
            self.scaling_ratio_range[0], self.scaling_ratio_range[1]
        if self.scaling_sync_view:
            scaling_ratio = random.uniform(ratio_lo, ratio_hi)

        # Stay defined when there is no view at all; the helpers then have
        # nothing to iterate over and never read these values.
        width = height = None
        for view_id in range(len(results['img'])):
            img = results['img'][view_id]
            height = img.shape[0] + self.border[0] * 2
            width = img.shape[1] + self.border[1] * 2

            if self.flip_sync:
                flip = flip_3d
            else:
                flip = bool(np.random.random() < self.flip_ratio)
            flips.append(flip)
            results['image_flip'].append(bool(flip))
            if flip:
                img = cv2.flip(img, 1)

            # Scaling
            if not self.scaling_sync_view:
                scaling_ratio = random.uniform(ratio_lo, ratio_hi)
            scaling_matrix = self._get_scaling_matrix(scaling_ratio)
            scaling_ratios.append(scaling_ratio)
            # When shrinking, only the top-left part of the canvas holds
            # image content.
            reduction_ratio = min(1.0, scaling_ratio)
            valid_shapes.append(
                [reduction_ratio * width, reduction_ratio * height])

            # Translation: only an enlarged image can be shifted (by a
            # non-positive offset) without exposing the border.
            if self.trans_when_scaling and scaling_ratio > 1:
                trans_x = random.uniform((1 - scaling_ratio) * width, 0)
                trans_y = random.uniform((1 - scaling_ratio) * height, 0)
            else:
                trans_x = 0
                trans_y = 0
            translate_matrix = self._get_translation_matrix(trans_x, trans_y)

            warp_matrix = translate_matrix @ scaling_matrix
            img = cv2.warpPerspective(
                img,
                warp_matrix,
                dsize=(width, height),
                borderValue=self.border_val
            )
            results['img'][view_id] = img
            warp_mats.append(warp_matrix)

        results['valid_shape'] = np.array(valid_shapes)
        results['img_scale_ratios'] = np.array(scaling_ratios)

        if 'gt_bboxes' in results:
            results = self._transform_bbox(
                results, warp_mats, flips, width, height)
        results = self._transform_camera(results, warp_mats, flips, width)

        return results

    def __repr__(self):
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(scaling_ratio={self.scaling_ratio_range}, '
        repr_str += f'flip_ratio={self.flip_ratio}, '
        repr_str += f'border={self.border}, '
        repr_str += f'border_val={self.border_val}, '
        repr_str += f'bbox_clip_border={self.bbox_clip_border}, '
        repr_str += f'flip_sync_3d={self.flip_sync}, '
        repr_str += f'scaling_sync_view={self.scaling_sync_view}, '
        repr_str += f'trans_when_scaling={self.trans_when_scaling})'
        return repr_str

    @staticmethod
    def _get_scaling_matrix(scale_ratio):
        """Return the 3x3 homogeneous isotropic scaling matrix."""
        scaling_matrix = np.array(
            [[scale_ratio, 0., 0.], [0., scale_ratio, 0.], [0., 0., 1.]],
            dtype=np.float32)
        return scaling_matrix

    @staticmethod
    def _get_translation_matrix(x, y):
        """Return the 3x3 homogeneous translation matrix for (x, y)."""
        translation_matrix = np.array([[1, 0., x], [0., 1, y], [0., 0., 1.]],
                                      dtype=np.float32)
        return translation_matrix

    @staticmethod
    def _get_flip_matrix(flip, width):
        """Return the matrix mapping x to ``width - x``, or identity."""
        if flip:
            flip_matrix = np.array([
                [-1, 0, width],
                [0, 1, 0],
                [0, 0, 1]
            ])
        else:
            flip_matrix = np.eye(3)
        return flip_matrix
@PIPELINES.register_module()
class PhotoMetricDistortionMultiViewImage:
    """Photometric distortion applied independently to every view image.

    Each step below is applied with probability 0.5.  Random contrast is
    applied either before the HSV round trip or after it, never both.

    1. random brightness
    2. random contrast (when drawn to run before the HSV steps)
    3. convert color from BGR to HSV
    4. random saturation
    5. random hue
    6. convert color from HSV to BGR
    7. random contrast (when drawn to run after the HSV steps)
    8. randomly swap channels

    Args:
        brightness_delta (int): delta of brightness.
        contrast_range (tuple): range of contrast.
        saturation_range (tuple): range of saturation.
        hue_delta (int): delta of hue.
        swap_channel (bool): whether the random channel permutation of
            step 8 may be applied.
    """

    def __init__(self,
                 brightness_delta=32,
                 contrast_range=(0.5, 1.5),
                 saturation_range=(0.5, 1.5),
                 hue_delta=18,
                 swap_channel=True):
        self.brightness_delta = brightness_delta
        self.contrast_lower, self.contrast_upper = contrast_range
        self.saturation_lower, self.saturation_upper = saturation_range
        self.hue_delta = hue_delta
        self.swap_channel = swap_channel

    def __call__(self, results):
        """Call function to perform photometric distortion on images.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Result dict with ``'img'`` replaced by the list of
                distorted images, each converted to ``np.uint8``.
        """
        distorted = []
        for source in results['img']:
            img = source.astype(np.float32)
            assert img.dtype == np.float32, \
                'PhotoMetricDistortion needs the input image of dtype np.float32,'\
                ' please set "to_float32=True" in "LoadImageFromFile" pipeline'

            # random brightness
            if random.randint(2):
                shift = random.uniform(-self.brightness_delta,
                                       self.brightness_delta)
                img = np.clip(img + shift, 0, 255)

            # mode == 1 --> random contrast before the HSV steps
            # mode == 0 --> random contrast after the HSV steps
            mode = random.randint(2)
            if mode == 1 and random.randint(2):
                gain = random.uniform(self.contrast_lower,
                                      self.contrast_upper)
                img = np.clip(img * gain, 0, 255)

            # convert color from BGR to HSV
            img = mmcv.bgr2hsv(img)
            hue = img[..., 0]
            saturation = img[..., 1]

            # random saturation
            if random.randint(2):
                saturation *= random.uniform(self.saturation_lower,
                                             self.saturation_upper)
                img[..., 1] = np.clip(saturation, 0, 1)

            # random hue, wrapped back into [0, 360]
            if random.randint(2):
                hue += random.uniform(-self.hue_delta, self.hue_delta)
                hue[hue > 360] -= 360
                hue[hue < 0] += 360

            # convert color from HSV to BGR
            img = mmcv.hsv2bgr(img)

            # random contrast
            if mode == 0 and random.randint(2):
                gain = random.uniform(self.contrast_lower,
                                      self.contrast_upper)
                img = np.clip(img * gain, 0, 255)

            # randomly swap channels
            if self.swap_channel and random.randint(2):
                img = img[..., random.permutation(3)]

            distorted.append(img.astype(np.uint8))

        results['img'] = distorted
        return results

    def __repr__(self):
        contrast = (self.contrast_lower, self.contrast_upper)
        saturation = (self.saturation_lower, self.saturation_upper)
        return (f'{self.__class__.__name__}'
                f'(\nbrightness_delta={self.brightness_delta},\n'
                f'contrast_range={contrast},\n'
                f'saturation_range={saturation},\n'
                f'hue_delta={self.hue_delta})')
@PIPELINES.register_module()
class RandomFlip3D(RandomFlip):
    """Flip the points & bbox.

    The 2D flip is delegated to the parent ``RandomFlip``.  Pre-existing
    ``pcd_horizontal_flip`` / ``pcd_vertical_flip`` entries in the input
    dict are honoured when ``sync_2d`` is False; otherwise the decision is
    drawn from the ratios given at construction time.

    Args:
        sync_2d (bool, optional): Whether to apply flip according to the 2D
            images. If True, it will apply the same flip as that to 2D
            images. If False, it will decide whether to flip randomly and
            independently to that of 2D images. Defaults to True.
        flip_ratio_bev_horizontal (float, optional): The flipping
            probability in horizontal direction. Defaults to 0.0.
        flip_ratio_bev_vertical (float, optional): The flipping probability
            in vertical direction. Defaults to 0.0.
    """

    def __init__(self,
                 sync_2d=True,
                 flip_ratio_bev_horizontal=0.0,
                 flip_ratio_bev_vertical=0.0,
                 **kwargs):
        # The parent keeps the horizontal ratio as ``self.flip_ratio``.
        super().__init__(flip_ratio=flip_ratio_bev_horizontal, **kwargs)
        self.sync_2d = sync_2d
        self.flip_ratio_bev_vertical = flip_ratio_bev_vertical
        for ratio in (flip_ratio_bev_horizontal, flip_ratio_bev_vertical):
            if ratio is not None:
                assert isinstance(ratio, (int, float)) and 0 <= ratio <= 1

    def random_flip_data_3d(self, input_dict, direction='horizontal'):
        """Flip the points together with the 3D boxes, in place.

        Args:
            input_dict (dict): Result dict from loading pipeline.
            direction (str): Flip direction. Default: horizontal.
        """
        assert direction in ['horizontal', 'vertical']
        box_fields = input_dict['bbox3d_fields']
        if len(box_fields) == 0:  # test mode
            # Without boxes an empty box container is registered so that
            # its ``flip`` method can still be used to flip the points.
            box_fields.append('empty_box3d')
            input_dict['empty_box3d'] = input_dict['box_type_3d'](
                np.array([], dtype=np.float32))
        assert len(box_fields) == 1
        for field in box_fields:
            flipped_points = input_dict[field].flip(
                direction, points=input_dict['points'])
            input_dict['points'] = flipped_points

    def __call__(self, input_dict):
        """Flip the 2D data via the parent class, then the 3D data.

        Args:
            input_dict (dict): Result dict from loading pipeline.

        Returns:
            dict: Flipped results; 'pcd_horizontal_flip',
                'pcd_vertical_flip' and 'transformation_3d_flow' are
                present in the dict afterwards.
        """
        # flip 2D image and its annotations
        super().__call__(input_dict)

        if self.sync_2d:
            input_dict['pcd_horizontal_flip'] = input_dict['flip']
            input_dict['pcd_vertical_flip'] = False
        else:
            if 'pcd_horizontal_flip' not in input_dict:
                input_dict['pcd_horizontal_flip'] = bool(
                    np.random.rand() < self.flip_ratio)
            if 'pcd_vertical_flip' not in input_dict:
                input_dict['pcd_vertical_flip'] = bool(
                    np.random.rand() < self.flip_ratio_bev_vertical)

        input_dict.setdefault('transformation_3d_flow', [])

        for flag, direction, tag in (
                ('pcd_horizontal_flip', 'horizontal', 'HF'),
                ('pcd_vertical_flip', 'vertical', 'VF')):
            if input_dict[flag]:
                self.random_flip_data_3d(input_dict, direction)
                input_dict['transformation_3d_flow'].append(tag)
        return input_dict

    def __repr__(self):
        """str: Return a string that describes the module."""
        return (f'{self.__class__.__name__}'
                f'(sync_2d={self.sync_2d},'
                f'flip_ratio_bev_vertical={self.flip_ratio_bev_vertical})')
""" def __init__(self, sync_2d=True, flip_ratio_bev_horizontal=0.0, flip_ratio_bev_vertical=0.0, **kwargs): # super(OurRandomFlip3D, self).__init__( # flip_ratio=flip_ratio_bev_horizontal, **kwargs) self.sync_2d = sync_2d self.flip_ratio = flip_ratio_bev_horizontal self.flip_ratio_bev_vertical = flip_ratio_bev_vertical if flip_ratio_bev_horizontal is not None: assert isinstance( flip_ratio_bev_horizontal, (int, float)) and 0 <= flip_ratio_bev_horizontal <= 1 if flip_ratio_bev_vertical is not None: assert isinstance( flip_ratio_bev_vertical, (int, float)) and 0 <= flip_ratio_bev_vertical <= 1 def random_flip_data_3d(self, input_dict, direction='horizontal'): """Flip 3D data randomly. Args: input_dict (dict): Result dict from loading pipeline. direction (str): Flip direction. Default: horizontal. Returns: dict: Flipped results, 'points', 'bbox3d_fields' keys are \ updated in the result dict. """ assert direction in ['horizontal', 'vertical'] if len(input_dict['bbox3d_fields']) == 0: # test mode input_dict['bbox3d_fields'].append('empty_box3d') input_dict['empty_box3d'] = input_dict['box_type_3d']( np.array([], dtype=np.float32)) assert len(input_dict['bbox3d_fields']) == 1 for key in input_dict['bbox3d_fields']: input_dict['points'] = input_dict[key].flip( direction, points=input_dict['points']) if direction == 'horizontal': diag = np.ones(3) diag[1] = -1 elif direction == 'vertical': diag = np.ones(3) diag[0] = -1 matrix = np.diag(diag) for id in range(len(input_dict['lidar2cam_r'])): input_dict['lidar2cam_r'][id] = input_dict['lidar2cam_r'][id] @ matrix if 'gt_pts_centers_view' in input_dict and input_dict['gt_pts_centers_view'].shape[0] > 0: input_dict['gt_pts_centers_view'] = input_dict['gt_pts_centers_view'] @ matrix if 'gt_bboxes_lidar_view' in input_dict: input_dict['gt_bboxes_lidar_view'].flip(direction) def __call__(self, input_dict): """Call function to flip points, values in the ``bbox3d_fields`` and \ also flip 2D image and its annotations. 
Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Flipped results, 'flip', 'flip_direction', \ 'pcd_horizontal_flip' and 'pcd_vertical_flip' keys are added \ into result dict. """ # filp 2D image and its annotations # super(OurRandomFlip3D, self).__call__(input_dict) if self.sync_2d: input_dict['pcd_horizontal_flip'] = input_dict['flip'] input_dict['pcd_vertical_flip'] = False else: if 'pcd_horizontal_flip' not in input_dict: flip_horizontal = True if np.random.rand( ) < self.flip_ratio else False input_dict['pcd_horizontal_flip'] = flip_horizontal if 'pcd_vertical_flip' not in input_dict: flip_vertical = True if np.random.rand( ) < self.flip_ratio_bev_vertical else False input_dict['pcd_vertical_flip'] = flip_vertical if 'transformation_3d_flow' not in input_dict: input_dict['transformation_3d_flow'] = [] if input_dict['pcd_horizontal_flip']: self.random_flip_data_3d(input_dict, 'horizontal') input_dict['transformation_3d_flow'].extend(['HF']) if input_dict['pcd_vertical_flip']: self.random_flip_data_3d(input_dict, 'vertical') input_dict['transformation_3d_flow'].extend(['VF']) return input_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += '(sync_2d={},'.format(self.sync_2d) repr_str += 'flip_ratio_bev_vertical={})'.format( self.flip_ratio_bev_vertical) return repr_str @PIPELINES.register_module() class ObjectSample(object): """Sample GT objects to the data. Args: db_sampler (dict): Config dict of the database sampler. sample_2d (bool): Whether to also paste 2D image patch to the images This should be true when applying multi-modality cut-and-paste. Defaults to False. 
""" def __init__(self, db_sampler, sample_2d=False): self.sampler_cfg = db_sampler self.sample_2d = sample_2d if 'type' not in db_sampler.keys(): db_sampler['type'] = 'DataBaseSampler' self.db_sampler = build_from_cfg(db_sampler, OBJECTSAMPLERS) @staticmethod def remove_points_in_boxes(points, boxes): """Remove the points in the sampled bounding boxes. Args: points (np.ndarray): Input point cloud array. boxes (np.ndarray): Sampled ground truth boxes. Returns: np.ndarray: Points with those in the boxes removed. """ masks = box_np_ops.points_in_rbbox(points.coord.numpy(), boxes) points = points[np.logical_not(masks.any(-1))] return points def __call__(self, input_dict): """Call function to sample ground truth objects to the data. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after object sampling augmentation, \ 'points', 'gt_bboxes_3d', 'gt_labels_3d' keys are updated \ in the result dict. """ gt_bboxes_3d = input_dict['gt_bboxes_3d'] gt_labels_3d = input_dict['gt_labels_3d'] # change to float for blending operation points = input_dict['points'] if self.sample_2d: img = input_dict['img'] gt_bboxes_2d = input_dict['gt_bboxes'] # Assume for now 3D & 2D bboxes are the same sampled_dict = self.db_sampler.sample_all( gt_bboxes_3d.tensor.numpy(), gt_labels_3d, gt_bboxes_2d=gt_bboxes_2d, img=img) else: sampled_dict = self.db_sampler.sample_all( gt_bboxes_3d.tensor.numpy(), gt_labels_3d, img=None) if sampled_dict is not None: sampled_gt_bboxes_3d = sampled_dict['gt_bboxes_3d'] sampled_points = sampled_dict['points'] sampled_gt_labels = sampled_dict['gt_labels_3d'] gt_labels_3d = np.concatenate([gt_labels_3d, sampled_gt_labels], axis=0) gt_bboxes_3d = gt_bboxes_3d.new_box( np.concatenate( [gt_bboxes_3d.tensor.numpy(), sampled_gt_bboxes_3d])) points = self.remove_points_in_boxes(points, sampled_gt_bboxes_3d) # check the points dimension points = points.cat([sampled_points, points]) if self.sample_2d: sampled_gt_bboxes_2d = 
sampled_dict['gt_bboxes_2d'] gt_bboxes_2d = np.concatenate( [gt_bboxes_2d, sampled_gt_bboxes_2d]).astype(np.float32) input_dict['gt_bboxes'] = gt_bboxes_2d input_dict['img'] = sampled_dict['img'] input_dict['gt_bboxes_3d'] = gt_bboxes_3d input_dict['gt_labels_3d'] = gt_labels_3d.astype(np.long) input_dict['points'] = points return input_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f' sample_2d={self.sample_2d},' repr_str += f' data_root={self.sampler_cfg.data_root},' repr_str += f' info_path={self.sampler_cfg.info_path},' repr_str += f' rate={self.sampler_cfg.rate},' repr_str += f' prepare={self.sampler_cfg.prepare},' repr_str += f' classes={self.sampler_cfg.classes},' repr_str += f' sample_groups={self.sampler_cfg.sample_groups}' return repr_str @PIPELINES.register_module() class ObjectNoise(object): """Apply noise to each GT objects in the scene. Args: translation_std (list[float], optional): Standard deviation of the distribution where translation noise are sampled from. Defaults to [0.25, 0.25, 0.25]. global_rot_range (list[float], optional): Global rotation to the scene. Defaults to [0.0, 0.0]. rot_range (list[float], optional): Object rotation range. Defaults to [-0.15707963267, 0.15707963267]. num_try (int, optional): Number of times to try if the noise applied is invalid. Defaults to 100. """ def __init__(self, translation_std=[0.25, 0.25, 0.25], global_rot_range=[0.0, 0.0], rot_range=[-0.15707963267, 0.15707963267], num_try=100): self.translation_std = translation_std self.global_rot_range = global_rot_range self.rot_range = rot_range self.num_try = num_try def __call__(self, input_dict): """Call function to apply noise to each ground truth in the scene. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after adding noise to each object, \ 'points', 'gt_bboxes_3d' keys are updated in the result dict. 
""" gt_bboxes_3d = input_dict['gt_bboxes_3d'] points = input_dict['points'] # TODO: check this inplace function numpy_box = gt_bboxes_3d.tensor.numpy() numpy_points = points.tensor.numpy() noise_per_object_v3_( numpy_box, numpy_points, rotation_perturb=self.rot_range, center_noise_std=self.translation_std, global_random_rot_range=self.global_rot_range, num_try=self.num_try) input_dict['gt_bboxes_3d'] = gt_bboxes_3d.new_box(numpy_box) input_dict['points'] = points.new_point(numpy_points) return input_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += '(num_try={},'.format(self.num_try) repr_str += ' translation_std={},'.format(self.translation_std) repr_str += ' global_rot_range={},'.format(self.global_rot_range) repr_str += ' rot_range={})'.format(self.rot_range) return repr_str @PIPELINES.register_module() class GlobalRotScaleTrans(object): """Apply global rotation, scaling and translation to a 3D scene. Args: rot_range (list[float]): Range of rotation angle. Defaults to [-0.78539816, 0.78539816] (close to [-pi/4, pi/4]). scale_ratio_range (list[float]): Range of scale ratio. Defaults to [0.95, 1.05]. translation_std (list[float]): The standard deviation of ranslation noise. This apply random translation to a scene by a noise, which is sampled from a gaussian distribution whose standard deviation is set by ``translation_std``. Defaults to [0, 0, 0] shift_height (bool): Whether to shift height. (the fourth dimension of indoor points) when scaling. Defaults to False. """ def __init__(self, rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0], shift_height=False): self.rot_range = rot_range self.scale_ratio_range = scale_ratio_range self.translation_std = translation_std self.shift_height = shift_height def _trans_bbox_points(self, input_dict): """Private function to translate bounding boxes and points. 
@PIPELINES.register_module()
class GlobalRotScaleTrans(object):
    """Apply global rotation, scaling and translation to a 3D scene.

    Args:
        rot_range (list[float] | tuple[float] | float): Range of rotation
            angle; a scalar ``r`` is read as ``[-r, r]``.
            Defaults to [-0.78539816, 0.78539816] (close to [-pi/4, pi/4]).
        scale_ratio_range (list[float]): Range of scale ratio.
            Defaults to [0.95, 1.05].
        translation_std (list[float] | float): The standard deviation of
            translation noise. This applies random translation to a scene
            by a noise, which is sampled from a gaussian distribution whose
            standard deviation is set by ``translation_std``; a scalar is
            used for all three axes. Defaults to [0, 0, 0]
        shift_height (bool): Whether to shift height.
            (the fourth dimension of indoor points) when scaling.
            Defaults to False.
    """

    def __init__(self,
                 rot_range=[-0.78539816, 0.78539816],
                 scale_ratio_range=[0.95, 1.05],
                 translation_std=[0, 0, 0],
                 shift_height=False):
        self.rot_range = rot_range
        self.scale_ratio_range = scale_ratio_range
        self.translation_std = translation_std
        self.shift_height = shift_height

    def _trans_bbox_points(self, input_dict):
        """Private function to translate bounding boxes and points.

        Updates 'points', 'pcd_trans' and the entries named by
        ``input_dict['bbox3d_fields']`` in place.

        Args:
            input_dict (dict): Result dict from loading pipeline.
        """
        if not isinstance(self.translation_std, (list, tuple, np.ndarray)):
            translation_std = [
                self.translation_std, self.translation_std,
                self.translation_std
            ]
        else:
            translation_std = self.translation_std
        translation_std = np.array(translation_std, dtype=np.float32)
        trans_factor = np.random.normal(scale=translation_std, size=3).T

        input_dict['points'].translate(trans_factor)
        input_dict['pcd_trans'] = trans_factor
        for key in input_dict['bbox3d_fields']:
            input_dict[key].translate(trans_factor)

    def _rot_bbox_points(self, input_dict):
        """Private function to rotate bounding boxes and points.

        The points are rotated through the ``rotate`` method of the box
        fields, so 'points' and 'pcd_rotation' are only updated for fields
        whose box tensor is not empty.

        Args:
            input_dict (dict): Result dict from loading pipeline.
        """
        rotation = self.rot_range
        # A scalar means a symmetric range; sequences are used as given.
        if not isinstance(rotation, (list, tuple)):
            rotation = [-rotation, rotation]
        noise_rotation = np.random.uniform(rotation[0], rotation[1])

        for key in input_dict['bbox3d_fields']:
            if len(input_dict[key].tensor) != 0:
                points, rot_mat_T = input_dict[key].rotate(
                    noise_rotation, input_dict['points'])
                input_dict['points'] = points
                input_dict['pcd_rotation'] = rot_mat_T

    def _scale_bbox_points(self, input_dict):
        """Private function to scale bounding boxes and points.

        Reads ``input_dict['pcd_scale_factor']`` and updates 'points' and
        the entries named by ``input_dict['bbox3d_fields']`` in place.

        Args:
            input_dict (dict): Result dict from loading pipeline.
        """
        scale = input_dict['pcd_scale_factor']
        points = input_dict['points']
        points.scale(scale)
        if self.shift_height:
            assert 'height' in points.attribute_dims.keys()
            points.tensor[:, points.attribute_dims['height']] *= scale
        input_dict['points'] = points

        for key in input_dict['bbox3d_fields']:
            input_dict[key].scale(scale)

    def _random_scale(self, input_dict):
        """Private function to randomly set the scale factor.

        Stores the drawn factor under 'pcd_scale_factor'.

        Args:
            input_dict (dict): Result dict from loading pipeline.
        """
        scale_factor = np.random.uniform(self.scale_ratio_range[0],
                                         self.scale_ratio_range[1])
        input_dict['pcd_scale_factor'] = scale_factor

    def __call__(self, input_dict):
        """Rotate, scale and translate bounding boxes and points.

        A 'pcd_scale_factor' already present in the input is reused
        instead of drawing a new one.

        Args:
            input_dict (dict): Result dict from loading pipeline.

        Returns:
            dict: Results after scaling, 'points', 'pcd_rotation',
                'pcd_scale_factor', 'pcd_trans' and keys in \
                input_dict['bbox3d_fields'] are updated in the result dict.
        """
        if 'transformation_3d_flow' not in input_dict:
            input_dict['transformation_3d_flow'] = []

        self._rot_bbox_points(input_dict)

        if 'pcd_scale_factor' not in input_dict:
            self._random_scale(input_dict)
        self._scale_bbox_points(input_dict)

        self._trans_bbox_points(input_dict)

        input_dict['transformation_3d_flow'].extend(['R', 'S', 'T'])
        return input_dict

    def __repr__(self):
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += '(rot_range={},'.format(self.rot_range)
        repr_str += ' scale_ratio_range={},'.format(self.scale_ratio_range)
        repr_str += ' translation_std={},'.format(self.translation_std)
        repr_str += ' shift_height={})'.format(self.shift_height)
        return repr_str
@PIPELINES.register_module()
class OurGlobalRotScaleTrans(object):
    """Apply global rotation, scaling and translation to a 3D scene.

    In addition to points and the ``bbox3d_fields`` boxes, the per-view
    entries ('lidar2cam_r', 'lidar2cam_t', 'gt_pts_centers_view',
    'gt_img_centers_view', 'gt_bboxes_cam_view', 'gt_bboxes_lidar_view')
    are updated so that they stay consistent with the transformed scene.
    A 'pcd_rotation_angle' or 'pcd_scale_factor' already present in the
    input is reused instead of drawing a new value.

    Args:
        rot_range (list[float] | tuple[float] | float): Range of rotation
            angle; a scalar ``r`` is read as ``[-r, r]``.
            Defaults to [-0.78539816, 0.78539816] (close to [-pi/4, pi/4]).
        scale_ratio_range (list[float]): Range of scale ratio.
            Defaults to [0.95, 1.05].
        translation_std (list[float] | float): The standard deviation of
            translation noise. This applies random translation to a scene
            by a noise, which is sampled from a gaussian distribution whose
            standard deviation is set by ``translation_std``; a scalar is
            used for all three axes. Defaults to [0, 0, 0]
        shift_height (bool): Whether to shift height.
            (the fourth dimension of indoor points) when scaling.
            Defaults to False.
    """

    def __init__(self,
                 rot_range=[-0.78539816, 0.78539816],
                 scale_ratio_range=[0.95, 1.05],
                 translation_std=[0, 0, 0],
                 shift_height=False):
        self.rot_range = rot_range
        self.scale_ratio_range = scale_ratio_range
        self.translation_std = translation_std
        self.shift_height = shift_height

    def _trans_bbox_points(self, input_dict):
        """Private function to translate bounding boxes and points.

        Updates 'points', 'pcd_trans', the ``bbox3d_fields`` boxes and the
        per-view lidar data in place.

        Args:
            input_dict (dict): Result dict from loading pipeline.
        """
        if not isinstance(self.translation_std, (list, tuple, np.ndarray)):
            translation_std = [
                self.translation_std, self.translation_std,
                self.translation_std
            ]
        else:
            translation_std = self.translation_std
        translation_std = np.array(translation_std, dtype=np.float32)
        trans_factor = np.random.normal(scale=translation_std, size=3).T

        input_dict['points'].translate(trans_factor)
        input_dict['pcd_trans'] = trans_factor
        for key in input_dict['bbox3d_fields']:
            input_dict[key].translate(trans_factor)

        # With p' = p + d in the lidar frame, R p + t = R p' + (t - R d),
        # so the lidar-to-camera translation absorbs the shift.
        for view_id in range(len(input_dict['lidar2cam_t'])):
            input_dict['lidar2cam_t'][view_id] = \
                input_dict['lidar2cam_t'][view_id] \
                - input_dict['lidar2cam_r'][view_id] @ trans_factor

        if 'gt_pts_centers_view' in input_dict:
            input_dict['gt_pts_centers_view'] = \
                input_dict['gt_pts_centers_view'] + trans_factor

        if 'gt_bboxes_lidar_view' in input_dict:
            input_dict['gt_bboxes_lidar_view'].translate(trans_factor)

    def _rot_bbox_points(self, input_dict):
        """Private function to rotate bounding boxes and points.

        The angle is read from ``input_dict['pcd_rotation_angle']``.  The
        points are rotated through the ``rotate`` method of the box
        fields, so nothing is rotated when every box tensor is empty.

        Args:
            input_dict (dict): Result dict from loading pipeline.
        """
        noise_rotation = input_dict['pcd_rotation_angle']

        rot_mat_T = None
        for key in input_dict['bbox3d_fields']:
            if len(input_dict[key].tensor) != 0:
                points, rot_mat_T = input_dict[key].rotate(
                    noise_rotation, input_dict['points'])
                input_dict['points'] = points
                input_dict['pcd_rotation'] = rot_mat_T

        # The per-view data follows the scene only if it was rotated.
        if rot_mat_T is not None:
            rot_mat_T_np = rot_mat_T.numpy()
            for view_id in range(len(input_dict['lidar2cam_r'])):
                input_dict['lidar2cam_r'][view_id] = \
                    input_dict['lidar2cam_r'][view_id] @ rot_mat_T_np

            # Guarded like in the other methods of this class: the entry
            # is optional.
            if 'gt_pts_centers_view' in input_dict and \
                    input_dict['gt_pts_centers_view'].shape[0] > 0:
                input_dict['gt_pts_centers_view'] = \
                    input_dict['gt_pts_centers_view'] @ rot_mat_T_np

            if 'gt_bboxes_lidar_view' in input_dict:
                input_dict['gt_bboxes_lidar_view'].rotate(noise_rotation)

    def _scale_bbox_points(self, input_dict):
        """Private function to scale bounding boxes and points.

        Reads ``input_dict['pcd_scale_factor']`` and updates 'points', the
        ``bbox3d_fields`` boxes and the per-view data in place.

        Args:
            input_dict (dict): Result dict from loading pipeline.
        """
        scale = input_dict['pcd_scale_factor']
        points = input_dict['points']
        points.scale(scale)
        if self.shift_height:
            assert 'height' in points.attribute_dims.keys()
            points.tensor[:, points.attribute_dims['height']] *= scale
        input_dict['points'] = points

        for key in input_dict['bbox3d_fields']:
            input_dict[key].scale(scale)

        # Only column 2 of the image centres is scaled here.
        if 'gt_img_centers_view' in input_dict and \
                input_dict['gt_img_centers_view'].shape[0] > 0:
            input_dict['gt_img_centers_view'][:, 2] *= scale

        for view_id in range(len(input_dict['lidar2cam_t'])):
            input_dict['lidar2cam_t'][view_id] = \
                input_dict['lidar2cam_t'][view_id] * scale

        if 'gt_pts_centers_view' in input_dict and \
                input_dict['gt_pts_centers_view'].shape[0] > 0:
            input_dict['gt_pts_centers_view'] = \
                input_dict['gt_pts_centers_view'] * scale

        if 'gt_bboxes_cam_view' in input_dict:
            input_dict['gt_bboxes_cam_view'].scale(scale)

        if 'gt_bboxes_lidar_view' in input_dict:
            input_dict['gt_bboxes_lidar_view'].scale(scale)

    def _random_scale(self, input_dict):
        """Private function to randomly set the scale factor.

        Stores the drawn factor under 'pcd_scale_factor'.

        Args:
            input_dict (dict): Result dict from loading pipeline.
        """
        scale_factor = np.random.uniform(self.scale_ratio_range[0],
                                         self.scale_ratio_range[1])
        input_dict['pcd_scale_factor'] = scale_factor

    def _random_rotation(self, input_dict):
        """Private function to randomly set the rotation angle.

        Stores the drawn angle under 'pcd_rotation_angle'.

        Args:
            input_dict (dict): Result dict from loading pipeline.
        """
        rotation = self.rot_range
        # A scalar means a symmetric range; sequences are used as given.
        if not isinstance(rotation, (list, tuple)):
            rotation = [-rotation, rotation]
        noise_rotation = np.random.uniform(rotation[0], rotation[1])
        input_dict['pcd_rotation_angle'] = noise_rotation

    def __call__(self, input_dict):
        """Rotate, scale and translate bounding boxes and points.

        Args:
            input_dict (dict): Result dict from loading pipeline.

        Returns:
            dict: Results after scaling, 'points', 'pcd_rotation',
                'pcd_scale_factor', 'pcd_trans' and keys in \
                input_dict['bbox3d_fields'] are updated in the result dict.
        """
        if 'transformation_3d_flow' not in input_dict:
            input_dict['transformation_3d_flow'] = []

        if 'pcd_rotation_angle' not in input_dict:
            self._random_rotation(input_dict)
        self._rot_bbox_points(input_dict)

        if 'pcd_scale_factor' not in input_dict:
            self._random_scale(input_dict)
        self._scale_bbox_points(input_dict)

        self._trans_bbox_points(input_dict)

        input_dict['transformation_3d_flow'].extend(['R', 'S', 'T'])
        return input_dict

    def __repr__(self):
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += '(rot_range={},'.format(self.rot_range)
        repr_str += ' scale_ratio_range={},'.format(self.scale_ratio_range)
        repr_str += ' translation_std={},'.format(self.translation_std)
        repr_str += ' shift_height={})'.format(self.shift_height)
        return repr_str
Returns: dict: Results after scaling, 'points', 'pcd_rotation', 'pcd_scale_factor', 'pcd_trans' and keys in \ input_dict['bbox3d_fields'] are updated in the result dict. """ if 'transformation_3d_flow' not in input_dict: input_dict['transformation_3d_flow'] = [] if 'pcd_rotation_angle' not in input_dict: self._random_rotation(input_dict) self._rot_bbox_points(input_dict) if 'pcd_scale_factor' not in input_dict: self._random_scale(input_dict) self._scale_bbox_points(input_dict) self._trans_bbox_points(input_dict) input_dict['transformation_3d_flow'].extend(['R', 'S', 'T']) return input_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += '(rot_range={},'.format(self.rot_range) repr_str += ' scale_ratio_range={},'.format(self.scale_ratio_range) repr_str += ' translation_std={})'.format(self.translation_std) repr_str += ' shift_height={})'.format(self.shift_height) return repr_str @PIPELINES.register_module() class PointShuffle(object): """Shuffle input points.""" def __call__(self, input_dict): """Call function to shuffle points. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after filtering, 'points' keys are updated \ in the result dict. """ input_dict['points'].shuffle() return input_dict def __repr__(self): return self.__class__.__name__ @PIPELINES.register_module() class ObjectRangeFilter(object): """Filter objects by the range. Args: point_cloud_range (list[float]): Point cloud range. """ def __init__(self, point_cloud_range): self.pcd_range = np.array(point_cloud_range, dtype=np.float32) self.bev_range = self.pcd_range[[0, 1, 3, 4]] def __call__(self, input_dict): """Call function to filter objects by the range. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \ keys are updated in the result dict. 
""" gt_bboxes_3d = input_dict['gt_bboxes_3d'] gt_labels_3d = input_dict['gt_labels_3d'] mask = gt_bboxes_3d.in_range_bev(self.bev_range) gt_bboxes_3d = gt_bboxes_3d[mask] # mask is a torch tensor but gt_labels_3d is still numpy array # using mask to index gt_labels_3d will cause bug when # len(gt_labels_3d) == 1, where mask=1 will be interpreted # as gt_labels_3d[1] and cause out of index error gt_labels_3d = gt_labels_3d[mask.numpy().astype(np.bool)] # limit rad to [-pi, pi] gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi) input_dict['gt_bboxes_3d'] = gt_bboxes_3d input_dict['gt_labels_3d'] = gt_labels_3d return input_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += '(point_cloud_range={})'.format(self.pcd_range.tolist()) return repr_str @PIPELINES.register_module() class OurObjectRangeFilter(object): """Filter objects by the range. Args: point_cloud_range (list[float]): Point cloud range. """ def __init__(self, point_cloud_range): self.pcd_range = np.array(point_cloud_range, dtype=np.float32) self.bev_range = self.pcd_range[[0, 1, 3, 4]] def __call__(self, input_dict): """Call function to filter objects by the range. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \ keys are updated in the result dict. 
""" gt_bboxes_3d = input_dict['gt_bboxes_3d'] gt_labels_3d = input_dict['gt_labels_3d'] mask = gt_bboxes_3d.in_range_bev(self.bev_range) gt_bboxes_3d = gt_bboxes_3d[mask] # mask is a torch tensor but gt_labels_3d is still numpy array # using mask to index gt_labels_3d will cause bug when # len(gt_labels_3d) == 1, where mask=1 will be interpreted # as gt_labels_3d[1] and cause out of index error gt_labels_3d = gt_labels_3d[mask.numpy().astype(np.bool)] # limit rad to [-pi, pi] gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi) input_dict['gt_bboxes_3d'] = gt_bboxes_3d input_dict['gt_labels_3d'] = gt_labels_3d if 'gt_visible_3d' in input_dict: gt_visible_3d = input_dict['gt_visible_3d'] gt_visible_3d = gt_visible_3d[mask.numpy().astype(np.bool)] input_dict['gt_visible_3d'] = gt_visible_3d pts_2d = input_dict['gt_pts_centers_view'] mask_2d = (pts_2d[:, 0] > self.bev_range[0]) & (pts_2d[:, 0] < self.bev_range[2]) & (pts_2d[:, 1] > self.bev_range[1]) & (pts_2d[:, 1] < self.bev_range[3]) input_dict['gt_bboxes'] = input_dict['gt_bboxes'][mask_2d] input_dict['gt_labels'] = input_dict['gt_labels'][mask_2d] input_dict['gt_img_centers_view'] = input_dict['gt_img_centers_view'][mask_2d] input_dict['gt_pts_centers_view'] = input_dict['gt_pts_centers_view'][mask_2d] if 'gt_bboxes_cam_view' in input_dict: input_dict['gt_bboxes_cam_view'] = input_dict['gt_bboxes_cam_view'][mask_2d] input_dict['gt_bboxes_lidar_view'] = input_dict['gt_bboxes_lidar_view'][mask_2d] return input_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += '(point_cloud_range={})'.format(self.pcd_range.tolist()) return repr_str @PIPELINES.register_module() class PointsRangeFilter(object): """Filter points by the range. Args: point_cloud_range (list[float]): Point cloud range. 
""" def __init__(self, point_cloud_range): self.pcd_range = np.array(point_cloud_range, dtype=np.float32) def __call__(self, input_dict): """Call function to filter points by the range. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after filtering, 'points' keys are updated \ in the result dict. """ points = input_dict['points'] points_mask = points.in_range_3d(self.pcd_range) clean_points = points[points_mask] input_dict['points'] = clean_points return input_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += '(point_cloud_range={})'.format(self.pcd_range.tolist()) return repr_str @PIPELINES.register_module() class ObjectNameFilter(object): """Filter GT objects by their names. Args: classes (list[str]): List of class names to be kept for training. """ def __init__(self, classes): self.classes = classes self.labels = list(range(len(self.classes))) def __call__(self, input_dict): """Call function to filter objects by their names. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \ keys are updated in the result dict. 
""" gt_labels_3d = input_dict['gt_labels_3d'] gt_bboxes_mask = np.array([n in self.labels for n in gt_labels_3d], dtype=np.bool_) input_dict['gt_bboxes_3d'] = input_dict['gt_bboxes_3d'][gt_bboxes_mask] input_dict['gt_labels_3d'] = input_dict['gt_labels_3d'][gt_bboxes_mask] if 'gt_visible_3d' in input_dict: input_dict['gt_visible_3d'] = input_dict['gt_visible_3d'][gt_bboxes_mask] if 'gt_labels' in input_dict: gt_labels = input_dict['gt_labels'] if gt_labels.shape[0] > 0: gt_bboxes_mask = np.array([n[0] in self.labels for n in gt_labels], dtype=np.bool_) input_dict['gt_bboxes'] = input_dict['gt_bboxes'][gt_bboxes_mask] input_dict['gt_labels'] = input_dict['gt_labels'][gt_bboxes_mask] if 'gt_img_centers_view' in input_dict: input_dict['gt_img_centers_view'] = input_dict['gt_img_centers_view'][gt_bboxes_mask] input_dict['gt_pts_centers_view'] = input_dict['gt_pts_centers_view'][gt_bboxes_mask] if 'gt_bboxes_cam_view' in input_dict: input_dict['gt_bboxes_cam_view'] = input_dict['gt_bboxes_cam_view'][gt_bboxes_mask] if 'gt_bboxes_lidar_view' in input_dict: input_dict['gt_bboxes_lidar_view'] = input_dict['gt_bboxes_lidar_view'][gt_bboxes_mask] return input_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(classes={self.classes})' return repr_str @PIPELINES.register_module() class IndoorPointSample(object): """Indoor point sample. Sampling data to a certain number. Args: name (str): Name of the dataset. num_points (int): Number of points to be sampled. """ def __init__(self, num_points): self.num_points = num_points def points_random_sampling(self, points, num_samples, replace=None, return_choices=False): """Points random sampling. Sample points to a certain number. Args: points (np.ndarray): 3D Points. num_samples (int): Number of samples to be sampled. replace (bool): Whether the sample is with or without replacement. Defaults to None. return_choices (bool): Whether return choice. 
Defaults to False. Returns: tuple[np.ndarray] | np.ndarray: - points (np.ndarray): 3D Points. - choices (np.ndarray, optional): The generated random samples. """ if replace is None: replace = (points.shape[0] < num_samples) choices = np.random.choice( points.shape[0], num_samples, replace=replace) if return_choices: return points[choices], choices else: return points[choices] def __call__(self, results): """Call function to sample points to in indoor scenes. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after sampling, 'points', 'pts_instance_mask' \ and 'pts_semantic_mask' keys are updated in the result dict. """ points = results['points'] points, choices = self.points_random_sampling( points, self.num_points, return_choices=True) pts_instance_mask = results.get('pts_instance_mask', None) pts_semantic_mask = results.get('pts_semantic_mask', None) results['points'] = points if pts_instance_mask is not None and pts_semantic_mask is not None: pts_instance_mask = pts_instance_mask[choices] pts_semantic_mask = pts_semantic_mask[choices] results['pts_instance_mask'] = pts_instance_mask results['pts_semantic_mask'] = pts_semantic_mask return results def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += '(num_points={})'.format(self.num_points) return repr_str @PIPELINES.register_module() class BackgroundPointsFilter(object): """Filter background points near the bounding box. Args: bbox_enlarge_range (tuple[float], float): Bbox enlarge range. 
def __call__(self, input_dict):
    """Call function to filter background points near the boxes.

    A point is dropped when it lies inside a box enlarged by
    ``self.bbox_enlarge_range`` but outside every original box. Points
    inside a box and points away from all enlarged boxes are kept.
    ``pts_instance_mask`` / ``pts_semantic_mask`` are filtered with the
    same mask when present.

    Args:
        input_dict (dict): Result dict from loading pipeline.

    Returns:
        dict: Results after filtering, 'points' keys are updated \
            in the result dict.
    """
    points = input_dict['points']
    gt_bboxes_3d = input_dict['gt_bboxes_3d']

    # Work on a clone: ``Tensor.numpy()`` returns an array that shares
    # memory with the tensor, so writing the gravity centers below would
    # otherwise overwrite the ground-truth box centers in place.
    gt_bboxes_3d_np = gt_bboxes_3d.tensor.clone().numpy()
    gt_bboxes_3d_np[:, :3] = gt_bboxes_3d.gravity_center.numpy()
    enlarged_gt_bboxes_3d = gt_bboxes_3d_np.copy()
    enlarged_gt_bboxes_3d[:, 3:6] += self.bbox_enlarge_range
    points_numpy = points.tensor.numpy()
    foreground_masks = box_np_ops.points_in_rbbox(points_numpy,
                                                  gt_bboxes_3d_np)
    enlarge_foreground_masks = box_np_ops.points_in_rbbox(
        points_numpy, enlarged_gt_bboxes_3d)
    # Collapse the per-box axis: True if the point is in any box.
    foreground_masks = foreground_masks.max(1)
    enlarge_foreground_masks = enlarge_foreground_masks.max(1)
    valid_masks = ~np.logical_and(~foreground_masks,
                                  enlarge_foreground_masks)

    input_dict['points'] = points[valid_masks]
    pts_instance_mask = input_dict.get('pts_instance_mask', None)
    if pts_instance_mask is not None:
        input_dict['pts_instance_mask'] = pts_instance_mask[valid_masks]

    pts_semantic_mask = input_dict.get('pts_semantic_mask', None)
    if pts_semantic_mask is not None:
        input_dict['pts_semantic_mask'] = pts_semantic_mask[valid_masks]
    return input_dict
Apply voxel sampling to multiple sweep points. Args: cur_sweep_cfg (dict): Config for sampling current points. prev_sweep_cfg (dict): Config for sampling previous points. time_dim (int): Index that indicate the time dimention for input points. """ def __init__(self, cur_sweep_cfg, prev_sweep_cfg=None, time_dim=3): self.cur_voxel_generator = VoxelGenerator(**cur_sweep_cfg) self.cur_voxel_num = self.cur_voxel_generator._max_voxels self.time_dim = time_dim if prev_sweep_cfg is not None: assert prev_sweep_cfg['max_num_points'] == \ cur_sweep_cfg['max_num_points'] self.prev_voxel_generator = VoxelGenerator(**prev_sweep_cfg) self.prev_voxel_num = self.prev_voxel_generator._max_voxels else: self.prev_voxel_generator = None self.prev_voxel_num = 0 def _sample_points(self, points, sampler, point_dim): """Sample points for each points subset. Args: points (np.ndarray): Points subset to be sampled. sampler (VoxelGenerator): Voxel based sampler for each points subset. point_dim (int): The dimention of each points Returns: np.ndarray: Sampled points. """ voxels, coors, num_points_per_voxel = sampler.generate(points) if voxels.shape[0] < sampler._max_voxels: padding_points = np.zeros([ sampler._max_voxels - voxels.shape[0], sampler._max_num_points, point_dim ], dtype=points.dtype) padding_points[:] = voxels[0] sample_points = np.concatenate([voxels, padding_points], axis=0) else: sample_points = voxels return sample_points def __call__(self, results): """Call function to sample points from multiple sweeps. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after sampling, 'points', 'pts_instance_mask' \ and 'pts_semantic_mask' keys are updated in the result dict. 
""" points = results['points'] original_dim = points.shape[1] # TODO: process instance and semantic mask while _max_num_points # is larger than 1 # Extend points with seg and mask fields map_fields2dim = [] start_dim = original_dim points_numpy = points.tensor.numpy() extra_channel = [points_numpy] for idx, key in enumerate(results['pts_mask_fields']): map_fields2dim.append((key, idx + start_dim)) extra_channel.append(results[key][..., None]) start_dim += len(results['pts_mask_fields']) for idx, key in enumerate(results['pts_seg_fields']): map_fields2dim.append((key, idx + start_dim)) extra_channel.append(results[key][..., None]) points_numpy = np.concatenate(extra_channel, axis=-1) # Split points into two part, current sweep points and # previous sweeps points. # TODO: support different sampling methods for next sweeps points # and previous sweeps points. cur_points_flag = (points_numpy[:, self.time_dim] == 0) cur_sweep_points = points_numpy[cur_points_flag] prev_sweeps_points = points_numpy[~cur_points_flag] if prev_sweeps_points.shape[0] == 0: prev_sweeps_points = cur_sweep_points # Shuffle points before sampling np.random.shuffle(cur_sweep_points) np.random.shuffle(prev_sweeps_points) cur_sweep_points = self._sample_points(cur_sweep_points, self.cur_voxel_generator, points_numpy.shape[1]) if self.prev_voxel_generator is not None: prev_sweeps_points = self._sample_points(prev_sweeps_points, self.prev_voxel_generator, points_numpy.shape[1]) points_numpy = np.concatenate( [cur_sweep_points, prev_sweeps_points], 0) else: points_numpy = cur_sweep_points if self.cur_voxel_generator._max_num_points == 1: points_numpy = points_numpy.squeeze(1) results['points'] = points.new_point(points_numpy[..., :original_dim]) # Restore the correspoinding seg and mask fields for key, dim_index in map_fields2dim: results[key] = points_numpy[..., dim_index] return results def __repr__(self): """str: Return a string that describes the module.""" def _auto_indent(repr_str, indent): 
repr_str = repr_str.split('\n') repr_str = [' ' * indent + t + '\n' for t in repr_str] repr_str = ''.join(repr_str)[:-1] return repr_str repr_str = self.__class__.__name__ indent = 4 repr_str += '(\n' repr_str += ' ' * indent + f'num_cur_sweep={self.cur_voxel_num},\n' repr_str += ' ' * indent + f'num_prev_sweep={self.prev_voxel_num},\n' repr_str += ' ' * indent + f'time_dim={self.time_dim},\n' repr_str += ' ' * indent + 'cur_voxel_generator=\n' repr_str += f'{_auto_indent(repr(self.cur_voxel_generator), 8)},\n' repr_str += ' ' * indent + 'prev_voxel_generator=\n' repr_str += f'{_auto_indent(repr(self.prev_voxel_generator), 8)})' return repr_str ================================================ FILE: mmdet3d/datasets/registry.py ================================================ from mmcv.utils import Registry OBJECTSAMPLERS = Registry('Object sampler') ================================================ FILE: mmdet3d/datasets/scannet_dataset.py ================================================ import numpy as np from os import path as osp from mmdet3d.core import show_result from mmdet3d.core.bbox import DepthInstance3DBoxes from mmdet.datasets import DATASETS from .custom_3d import Custom3DDataset @DATASETS.register_module() class ScanNetDataset(Custom3DDataset): r"""ScanNet Dataset. This class serves as the API for experiments on the ScanNet Dataset. Please refer to the `github repo `_ for data downloading. Args: data_root (str): Path of dataset root. ann_file (str): Path of annotation file. pipeline (list[dict], optional): Pipeline used for data processing. Defaults to None. classes (tuple[str], optional): Classes used in the dataset. Defaults to None. modality (dict, optional): Modality to specify the sensor data used as input. Defaults to None. box_type_3d (str, optional): Type of 3D box of this dataset. Based on the `box_type_3d`, the dataset will encapsulate the box to its original format then converted them to `box_type_3d`. Defaults to 'Depth' in this dataset. 
def get_ann_info(self, index):
    """Get annotation info according to the given index.

    Args:
        index (int): Index of the annotation data to get.

    Returns:
        dict: annotation information consists of the following keys:

            - gt_bboxes_3d (:obj:`DepthInstance3DBoxes`): \
                3D ground truth bboxes
            - gt_labels_3d (np.ndarray): Labels of ground truths.
            - pts_instance_mask_path (str): Path of instance masks.
            - pts_semantic_mask_path (str): Path of semantic masks.
    """
    # Use index to get the annos, thus the evalhook could also use this api
    info = self.data_infos[index]
    if info['annos']['gt_num'] != 0:
        gt_bboxes_3d = info['annos']['gt_boxes_upright_depth'].astype(
            np.float32)  # k, 6
        # ``np.long`` is gone from recent NumPy releases; use the explicit
        # 64-bit integer type instead.
        gt_labels_3d = info['annos']['class'].astype(np.int64)
    else:
        gt_bboxes_3d = np.zeros((0, 6), dtype=np.float32)
        gt_labels_3d = np.zeros((0, ), dtype=np.int64)

    # to target box structure
    gt_bboxes_3d = DepthInstance3DBoxes(
        gt_bboxes_3d,
        box_dim=gt_bboxes_3d.shape[-1],
        with_yaw=False,
        origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)

    pts_instance_mask_path = osp.join(self.data_root,
                                      info['pts_instance_mask_path'])
    pts_semantic_mask_path = osp.join(self.data_root,
                                      info['pts_semantic_mask_path'])

    anns_results = dict(
        gt_bboxes_3d=gt_bboxes_3d,
        gt_labels_3d=gt_labels_3d,
        pts_instance_mask_path=pts_instance_mask_path,
        pts_semantic_mask_path=pts_semantic_mask_path)
    return anns_results
def get_ann_info(self, index):
    """Get annotation info according to the given index.

    Args:
        index (int): Index of the annotation data to get.

    Returns:
        dict: annotation information consists of the following keys:

            - pts_semantic_mask_path (str): Path of semantic masks,
              i.e. the info's ``pts_semantic_mask_path`` joined onto
              ``self.data_root``.
    """
    # Look up by index so the evalhook can use this api as well.
    record = self.data_infos[index]
    return {
        'pts_semantic_mask_path':
        osp.join(self.data_root, record['pts_semantic_mask_path'])
    }
""" CLASSES = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', 'night_stand', 'bookshelf', 'bathtub') def __init__(self, data_root, ann_file, pipeline=None, classes=None, modality=dict(use_camera=True, use_lidar=True), box_type_3d='Depth', filter_empty_gt=True, test_mode=False): super().__init__( data_root=data_root, ann_file=ann_file, pipeline=pipeline, classes=classes, modality=modality, box_type_3d=box_type_3d, filter_empty_gt=filter_empty_gt, test_mode=test_mode) assert 'use_camera' in self.modality and \ 'use_lidar' in self.modality assert self.modality['use_camera'] or self.modality['use_lidar'] def get_data_info(self, index): """Get data info according to the given index. Args: index (int): Index of the sample data to get. Returns: dict: Data information that will be passed to the data \ preprocessing pipelines. It includes the following keys: - sample_idx (str): Sample index. - pts_filename (str, optional): Filename of point clouds. - file_name (str, optional): Filename of point clouds. - img_prefix (str | None, optional): Prefix of image files. - img_info (dict, optional): Image info. - calib (dict, optional): Camera calibration info. - ann_info (dict): Annotation info. 
def get_ann_info(self, index):
    """Get annotation info according to the given index.

    Args:
        index (int): Index of the annotation data to get.

    Returns:
        dict: annotation information consists of the following keys:

            - gt_bboxes_3d (:obj:`DepthInstance3DBoxes`): \
                3D ground truth bboxes
            - gt_labels_3d (np.ndarray): Labels of ground truths.
            - bboxes (np.ndarray): 2D ground truth boxes, only when
              ``self.modality['use_camera']`` is true.
            - labels (np.ndarray): Same array as ``gt_labels_3d``, only
              when ``self.modality['use_camera']`` is true.
    """
    # Use index to get the annos, thus the evalhook could also use this api
    info = self.data_infos[index]
    has_gt = info['annos']['gt_num'] != 0

    if has_gt:
        gt_bboxes_3d = info['annos']['gt_boxes_upright_depth'].astype(
            np.float32)
        # ``np.long`` is gone from recent NumPy releases; use the explicit
        # 64-bit integer type instead.
        gt_labels_3d = info['annos']['class'].astype(np.int64)
    else:
        gt_bboxes_3d = np.zeros((0, 7), dtype=np.float32)
        gt_labels_3d = np.zeros((0, ), dtype=np.int64)

    # to target box structure
    gt_bboxes_3d = DepthInstance3DBoxes(
        gt_bboxes_3d, origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)

    anns_results = dict(
        gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d)

    if self.modality['use_camera']:
        if has_gt:
            gt_bboxes_2d = info['annos']['bbox'].astype(np.float32)
        else:
            gt_bboxes_2d = np.zeros((0, 4), dtype=np.float32)
        anns_results['bboxes'] = gt_bboxes_2d
        anns_results['labels'] = gt_labels_3d

    return anns_results
def evaluate(self,
             results,
             metric=None,
             iou_thr=(0.25, 0.5),
             iou_thr_2d=(0.5, ),
             logger=None,
             show=False,
             out_dir=None):
    """Evaluate 3D or 2D detection results.

    When the first result is a dict the results are treated as 3D
    detections and evaluation is delegated to the parent class.
    Otherwise a 2D mAP is computed with ``eval_map`` once per threshold
    in ``iou_thr_2d``.

    Args:
        results (list): Testing results of the dataset.
        metric: Forwarded unchanged to the parent ``evaluate`` (3D case).
        iou_thr (tuple[float]): IoU thresholds forwarded to the parent
            ``evaluate`` (3D case).
        iou_thr_2d (tuple[float] | float): IoU threshold(s) for the 2D
            evaluation. A single float is accepted.
        logger: Forwarded to the parent ``evaluate`` / ``eval_map``.
        show: Forwarded to the parent ``evaluate`` (3D case).
        out_dir: Forwarded to the parent ``evaluate`` (3D case).

    Returns:
        dict: The parent's result in the 3D case; in the 2D case an
            ``OrderedDict`` mapping ``'mAP_<thr>'`` to the mean AP.
    """
    # evaluate 3D detection performance
    if isinstance(results[0], dict):
        return super().evaluate(results, metric, iou_thr, logger, show,
                                out_dir)

    # evaluate 2D detection performance
    eval_results = OrderedDict()
    annotations = [self.get_ann_info(i) for i in range(len(self))]
    # Wrap a lone float in a real 1-tuple; ``(iou_thr_2d)`` was just the
    # float itself and could not be iterated below.
    if isinstance(iou_thr_2d, float):
        iou_thr_2d = (iou_thr_2d, )
    for iou_thr_2d_single in iou_thr_2d:
        mean_ap, _ = eval_map(
            results,
            annotations,
            scale_ranges=None,
            iou_thr=iou_thr_2d_single,
            dataset=self.CLASSES,
            logger=logger)
        eval_results['mAP_' + str(iou_thr_2d_single)] = mean_ap
    return eval_results
pts_prefix (str, optional): Prefix of points files. Defaults to 'velodyne'. pipeline (list[dict], optional): Pipeline used for data processing. Defaults to None. classes (tuple[str], optional): Classes used in the dataset. Defaults to None. modality (dict, optional): Modality to specify the sensor data used as input. Defaults to None. box_type_3d (str, optional): Type of 3D box of this dataset. Based on the `box_type_3d`, the dataset will encapsulate the box to its original format then converted them to `box_type_3d`. Defaults to 'LiDAR' in this dataset. Available options includes - 'LiDAR': box in LiDAR coordinates - 'Depth': box in depth coordinates, usually for indoor dataset - 'Camera': box in camera coordinates filter_empty_gt (bool, optional): Whether to filter empty GT. Defaults to True. test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. pcd_limit_range (list): The range of point cloud used to filter invalid predicted boxes. Default: [-85, -85, -5, 85, 85, 5]. 
""" CLASSES = ('Car', 'Cyclist', 'Pedestrian') def __init__(self, data_root, ann_file, split, num_views=5, pts_prefix='velodyne', pipeline=None, classes=None, modality=None, box_type_3d='LiDAR', filter_empty_gt=True, test_mode=False, load_interval=1, pcd_limit_range=[-85, -85, -5, 85, 85, 5]): super().__init__( data_root=data_root, ann_file=ann_file, split=split, pts_prefix=pts_prefix, pipeline=pipeline, classes=classes, modality=modality, box_type_3d=box_type_3d, filter_empty_gt=filter_empty_gt, test_mode=test_mode, pcd_limit_range=pcd_limit_range) self.num_views = num_views assert self.num_views <= 5 # to load a subset, just set the load_interval in the dataset config self.data_infos = self.data_infos[::load_interval] if hasattr(self, 'flag'): self.flag = self.flag[::load_interval] def _get_pts_filename(self, idx): pts_filename = osp.join(self.root_split, self.pts_prefix, f'{idx:07d}.bin') return pts_filename def get_data_info(self, index): """Get data info according to the given index. Args: index (int): Index of the sample data to get. Returns: dict: Standard input_dict consists of the data information. 
- sample_idx (str): sample index - pts_filename (str): filename of point clouds - img_prefix (str | None): prefix of image files - img_info (dict): image info - lidar2img (list[np.ndarray], optional): transformations from lidar to different cameras - ann_info (dict): annotation info """ info = self.data_infos[index] sample_idx = info['image']['image_idx'] img_filename = os.path.join(self.data_root, info['image']['image_path']) # TODO: consider use torch.Tensor only rect = info['calib']['R0_rect'].astype(np.float32) Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) P0 = info['calib']['P0'].astype(np.float32) lidar2img = P0 @ rect @ Trv2c # the Tr_velo_to_cam is computed for all images but not saved in .info for img1-4 # the size of img0-2: 1280x1920; img3-4: 886x1920 if self.modality['use_camera']: image_paths = [] lidar2img_rts = [] # load calibration for all 5 images. calib_path = img_filename.replace('image_0', 'calib').replace('.png', '.txt') Tr_velo_to_cam_list = [] with open(calib_path, 'r') as f: lines = f.readlines() for line_num in range(6, 6 + self.num_views): trans = np.array([float(info) for info in lines[line_num].split(' ')[1:13]]).reshape(3, 4) trans = np.concatenate([trans, np.array([[0., 0., 0., 1.]])], axis=0).astype(np.float32) Tr_velo_to_cam_list.append(trans) assert np.allclose(Tr_velo_to_cam_list[0], info['calib']['Tr_velo_to_cam'].astype(np.float32)) for idx_img in range(self.num_views): rect = info['calib']['R0_rect'].astype(np.float32) # Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) Trv2c = Tr_velo_to_cam_list[idx_img] P0 = info['calib'][f'P{idx_img}'].astype(np.float32) lidar2img = P0 @ rect @ Trv2c image_paths.append(img_filename.replace('image_0', f'image_{idx_img}')) lidar2img_rts.append(lidar2img) pts_filename = self._get_pts_filename(sample_idx) input_dict = dict( sample_idx=sample_idx, pts_filename=pts_filename, img_prefix=None, ) if self.modality['use_camera']: input_dict['img_filename'] = image_paths 
input_dict['lidar2img'] = lidar2img_rts if not self.test_mode: annos = self.get_ann_info(index) input_dict['ann_info'] = annos return input_dict def format_results(self, outputs, pklfile_prefix=None, submission_prefix=None, data_format='waymo'): """Format the results to pkl file. Args: outputs (list[dict]): Testing results of the dataset. pklfile_prefix (str | None): The prefix of pkl files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. submission_prefix (str | None): The prefix of submitted files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. data_format (str | None): Output data format. Default: 'waymo'. Another supported choice is 'kitti'. Returns: tuple: (result_files, tmp_dir), result_files is a dict containing the json filepaths, tmp_dir is the temporal directory created for saving json files when jsonfile_prefix is not specified. 
""" if pklfile_prefix is None: tmp_dir = tempfile.TemporaryDirectory() pklfile_prefix = osp.join(tmp_dir.name, 'results') else: tmp_dir = None assert ('waymo' in data_format or 'kitti' in data_format), \ f'invalid data_format {data_format}' if (not isinstance(outputs[0], dict)) or 'img_bbox' in outputs[0]: raise TypeError('Not supported type for reformat results.') elif 'pts_bbox' in outputs[0]: result_files = dict() for name in outputs[0]: results_ = [out[name] for out in outputs] pklfile_prefix_ = pklfile_prefix + name if submission_prefix is not None: submission_prefix_ = f'{submission_prefix}_{name}' else: submission_prefix_ = None result_files_ = self.bbox2result_kitti(results_, self.CLASSES, pklfile_prefix_, submission_prefix_) result_files[name] = result_files_ else: result_files = self.bbox2result_kitti(outputs, self.CLASSES, pklfile_prefix, submission_prefix) if 'waymo' in data_format: from ..core.evaluation.waymo_utils.prediction_kitti_to_waymo import \ KITTI2Waymo # noqa waymo_root = osp.join( self.data_root.split('kitti_format')[0], 'waymo_format') if self.split == 'training': waymo_tfrecords_dir = osp.join(waymo_root, 'validation') prefix = '1' elif self.split == 'testing': waymo_tfrecords_dir = osp.join(waymo_root, 'testing') prefix = '2' else: raise ValueError('Not supported split value.') save_tmp_dir = tempfile.TemporaryDirectory() waymo_results_save_dir = save_tmp_dir.name waymo_results_final_path = f'{pklfile_prefix}.bin' if 'pts_bbox' in result_files: converter = KITTI2Waymo(result_files['pts_bbox'], waymo_tfrecords_dir, waymo_results_save_dir, waymo_results_final_path, prefix) else: converter = KITTI2Waymo(result_files, waymo_tfrecords_dir, waymo_results_save_dir, waymo_results_final_path, prefix) converter.convert() save_tmp_dir.cleanup() return result_files, tmp_dir def evaluate(self, results, metric='waymo', logger=None, pklfile_prefix=None, submission_prefix=None, show=False, out_dir=None): """Evaluation in KITTI protocol. 
Args: results (list[dict]): Testing results of the dataset. metric (str | list[str]): Metrics to be evaluated. Default: 'waymo'. Another supported metric is 'kitti'. logger (logging.Logger | str | None): Logger used for printing related information during evaluation. Default: None. pklfile_prefix (str | None): The prefix of pkl files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. submission_prefix (str | None): The prefix of submission datas. If not specified, the submission data will not be generated. show (bool): Whether to visualize. Default: False. out_dir (str): Path to save the visualization results. Default: None. Returns: dict[str: float]: results of each evaluation metric """ assert ('waymo' in metric or 'kitti' in metric), \ f'invalid metric {metric}' if 'kitti' in metric: result_files, tmp_dir = self.format_results( results, pklfile_prefix, submission_prefix, data_format='kitti') from mmdet3d.core.evaluation import kitti_eval gt_annos = [info['annos'] for info in self.data_infos] if isinstance(result_files, dict): ap_dict = dict() for name, result_files_ in result_files.items(): eval_types = ['bev', '3d'] ap_result_str, ap_dict_ = kitti_eval( gt_annos, result_files_, self.CLASSES, eval_types=eval_types) for ap_type, ap in ap_dict_.items(): ap_dict[f'{name}/{ap_type}'] = float( '{:.4f}'.format(ap)) print_log( f'Results of {name}:\n' + ap_result_str, logger=logger) else: ap_result_str, ap_dict = kitti_eval( gt_annos, result_files, self.CLASSES, eval_types=['bev', '3d']) print_log('\n' + ap_result_str, logger=logger) if 'waymo' in metric: waymo_root = osp.join( self.data_root.split('kitti_format')[0], 'waymo_format') if pklfile_prefix is None: eval_tmp_dir = tempfile.TemporaryDirectory() pklfile_prefix = osp.join(eval_tmp_dir.name, 'results') else: eval_tmp_dir = None result_files, tmp_dir = self.format_results( results, pklfile_prefix, submission_prefix, 
data_format='waymo') import subprocess ret_bytes = subprocess.check_output( 'mmdet3d/core/evaluation/waymo_utils/' + f'compute_detection_metrics_main {pklfile_prefix}.bin ' + f'{waymo_root}/gt.bin', shell=True) ret_texts = ret_bytes.decode('utf-8') print_log(ret_texts) # parse the text to get ap_dict ap_dict = { 'Vehicle/L1 mAP': 0, 'Vehicle/L1 mAPH': 0, 'Vehicle/L2 mAP': 0, 'Vehicle/L2 mAPH': 0, 'Pedestrian/L1 mAP': 0, 'Pedestrian/L1 mAPH': 0, 'Pedestrian/L2 mAP': 0, 'Pedestrian/L2 mAPH': 0, 'Sign/L1 mAP': 0, 'Sign/L1 mAPH': 0, 'Sign/L2 mAP': 0, 'Sign/L2 mAPH': 0, 'Cyclist/L1 mAP': 0, 'Cyclist/L1 mAPH': 0, 'Cyclist/L2 mAP': 0, 'Cyclist/L2 mAPH': 0, 'Overall/L1 mAP': 0, 'Overall/L1 mAPH': 0, 'Overall/L2 mAP': 0, 'Overall/L2 mAPH': 0 } mAP_splits = ret_texts.split('mAP ') mAPH_splits = ret_texts.split('mAPH ') for idx, key in enumerate(ap_dict.keys()): split_idx = int(idx / 2) + 1 if idx % 2 == 0: # mAP ap_dict[key] = float(mAP_splits[split_idx].split(']')[0]) else: # mAPH ap_dict[key] = float(mAPH_splits[split_idx].split(']')[0]) ap_dict['Overall/L1 mAP'] = \ (ap_dict['Vehicle/L1 mAP'] + ap_dict['Pedestrian/L1 mAP'] + ap_dict['Cyclist/L1 mAP']) / 3 ap_dict['Overall/L1 mAPH'] = \ (ap_dict['Vehicle/L1 mAPH'] + ap_dict['Pedestrian/L1 mAPH'] + ap_dict['Cyclist/L1 mAPH']) / 3 ap_dict['Overall/L2 mAP'] = \ (ap_dict['Vehicle/L2 mAP'] + ap_dict['Pedestrian/L2 mAP'] + ap_dict['Cyclist/L2 mAP']) / 3 ap_dict['Overall/L2 mAPH'] = \ (ap_dict['Vehicle/L2 mAPH'] + ap_dict['Pedestrian/L2 mAPH'] + ap_dict['Cyclist/L2 mAPH']) / 3 if eval_tmp_dir is not None: eval_tmp_dir.cleanup() if tmp_dir is not None: tmp_dir.cleanup() if show: self.show(results, out_dir) return ap_dict def bbox2result_kitti(self, net_outputs, class_names, pklfile_prefix=None, submission_prefix=None): """Convert results to kitti format for evaluation and test submission. 
Args: net_outputs (List[np.ndarray]): list of array storing the bbox and score class_nanes (List[String]): A list of class names pklfile_prefix (str | None): The prefix of pkl file. submission_prefix (str | None): The prefix of submission file. Returns: List[dict]: A list of dict have the kitti 3d format """ assert len(net_outputs) == len(self.data_infos), \ 'invalid list length of network outputs' if submission_prefix is not None: mmcv.mkdir_or_exist(submission_prefix) det_annos = [] print('\nConverting prediction to KITTI format') for idx, pred_dicts in enumerate( mmcv.track_iter_progress(net_outputs)): annos = [] info = self.data_infos[idx] sample_idx = info['image']['image_idx'] image_shape = info['image']['image_shape'][:2] box_dict = self.convert_valid_bboxes(pred_dicts, info) if len(box_dict['bbox']) > 0: box_2d_preds = box_dict['bbox'] box_preds = box_dict['box3d_camera'] scores = box_dict['scores'] box_preds_lidar = box_dict['box3d_lidar'] label_preds = box_dict['label_preds'] anno = { 'name': [], 'truncated': [], 'occluded': [], 'alpha': [], 'bbox': [], 'dimensions': [], 'location': [], 'rotation_y': [], 'score': [] } for box, box_lidar, bbox, score, label in zip( box_preds, box_preds_lidar, box_2d_preds, scores, label_preds): bbox[2:] = np.minimum(bbox[2:], image_shape[::-1]) bbox[:2] = np.maximum(bbox[:2], [0, 0]) anno['name'].append(class_names[int(label)]) anno['truncated'].append(0.0) anno['occluded'].append(0) anno['alpha'].append( -np.arctan2(-box_lidar[1], box_lidar[0]) + box[6]) anno['bbox'].append(bbox) anno['dimensions'].append(box[3:6]) anno['location'].append(box[:3]) anno['rotation_y'].append(box[6]) anno['score'].append(score) anno = {k: np.stack(v) for k, v in anno.items()} annos.append(anno) if submission_prefix is not None: curr_file = f'{submission_prefix}/{sample_idx:07d}.txt' with open(curr_file, 'w') as f: bbox = anno['bbox'] loc = anno['location'] dims = anno['dimensions'] # lhw -> hwl for idx in range(len(bbox)): print( '{} -1 -1 
{:.4f} {:.4f} {:.4f} {:.4f} ' '{:.4f} {:.4f} {:.4f} ' '{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'. format(anno['name'][idx], anno['alpha'][idx], bbox[idx][0], bbox[idx][1], bbox[idx][2], bbox[idx][3], dims[idx][1], dims[idx][2], dims[idx][0], loc[idx][0], loc[idx][1], loc[idx][2], anno['rotation_y'][idx], anno['score'][idx]), file=f) else: annos.append({ 'name': np.array([]), 'truncated': np.array([]), 'occluded': np.array([]), 'alpha': np.array([]), 'bbox': np.zeros([0, 4]), 'dimensions': np.zeros([0, 3]), 'location': np.zeros([0, 3]), 'rotation_y': np.array([]), 'score': np.array([]), }) annos[-1]['sample_idx'] = np.array( [sample_idx] * len(annos[-1]['score']), dtype=np.int64) det_annos += annos if pklfile_prefix is not None: if not pklfile_prefix.endswith(('.pkl', '.pickle')): out = f'{pklfile_prefix}.pkl' mmcv.dump(det_annos, out) print(f'Result is saved to {out}.') return det_annos def convert_valid_bboxes(self, box_dict, info): """Convert the boxes into valid format. Args: box_dict (dict): Bounding boxes to be converted. - boxes_3d (:obj:``LiDARInstance3DBoxes``): 3D bounding boxes. - scores_3d (np.ndarray): Scores of predicted boxes. - labels_3d (np.ndarray): Class labels of predicted boxes. info (dict): Dataset information dictionary. Returns: dict: Valid boxes after conversion. - bbox (np.ndarray): 2D bounding boxes (in camera 0). - box3d_camera (np.ndarray): 3D boxes in camera coordinates. - box3d_lidar (np.ndarray): 3D boxes in lidar coordinates. - scores (np.ndarray): Scores of predicted boxes. - label_preds (np.ndarray): Class labels of predicted boxes. - sample_idx (np.ndarray): Sample index. 
""" # TODO: refactor this function box_preds = box_dict['boxes_3d'] scores = box_dict['scores_3d'] labels = box_dict['labels_3d'] sample_idx = info['image']['image_idx'] # TODO: remove the hack of yaw box_preds.limit_yaw(offset=0.5, period=np.pi * 2) if len(box_preds) == 0: return dict( bbox=np.zeros([0, 4]), box3d_camera=np.zeros([0, 7]), box3d_lidar=np.zeros([0, 7]), scores=np.zeros([0]), label_preds=np.zeros([0, 4]), sample_idx=sample_idx) rect = info['calib']['R0_rect'].astype(np.float32) Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) P0 = info['calib']['P0'].astype(np.float32) P0 = box_preds.tensor.new_tensor(P0) box_preds_camera = box_preds.convert_to(Box3DMode.CAM, rect @ Trv2c) box_corners = box_preds_camera.corners box_corners_in_image = points_cam2img(box_corners, P0) # box_corners_in_image: [N, 8, 2] minxy = torch.min(box_corners_in_image, dim=1)[0] maxxy = torch.max(box_corners_in_image, dim=1)[0] box_2d_preds = torch.cat([minxy, maxxy], dim=1) # Post-processing # check box_preds limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range) valid_pcd_inds = ((box_preds.center > limit_range[:3]) & (box_preds.center < limit_range[3:])) valid_inds = valid_pcd_inds.all(-1) if valid_inds.sum() > 0: return dict( bbox=box_2d_preds[valid_inds, :].numpy(), box3d_camera=box_preds_camera[valid_inds].tensor.numpy(), box3d_lidar=box_preds[valid_inds].tensor.numpy(), scores=scores[valid_inds].numpy(), label_preds=labels[valid_inds].numpy(), sample_idx=sample_idx, ) else: return dict( bbox=np.zeros([0, 4]), box3d_camera=np.zeros([0, 7]), box3d_lidar=np.zeros([0, 7]), scores=np.zeros([0]), label_preds=np.zeros([0, 4]), sample_idx=sample_idx, ) ================================================ FILE: mmdet3d/models/__init__.py ================================================ from .backbones import * # noqa: F401,F403 from .builder import (build_backbone, build_detector, build_fusion_layer, build_head, build_loss, build_middle_encoder, build_neck, 
def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'):
    """Build the download URL of a pretrained DLA checkpoint.

    Args:
        data (str): Name of the dataset the weights were trained on; used
            as a path component of the URL.
        name (str): Model name, e.g. 'dla34'.
        hash (str): Hash suffix of the checkpoint file name.

    Returns:
        str: ``http://dl.yf.io/dla/models/<data>/<name>-<hash>.pth``.
    """
    # URLs always use '/' as separator, so the parts are joined explicitly
    # instead of going through an OS-dependent path join (the `join` name
    # used before was never imported by this module).
    return '/'.join(
        ['http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash)])
class Bottleneck(nn.Module):
    """Residual bottleneck block: 1x1 reduce -> 3x3 -> 1x1 expand.

    The inner width is ``planes // Bottleneck.expansion``. Every convolution
    is followed by batch normalisation; the shortcut is added before the
    final ReLU.

    Args:
        inplanes (int): Number of input channels.
        planes (int): Number of output channels.
        stride (int): Stride of the 3x3 convolution. Defaults to 1.
        dilation (int): Dilation (and padding) of the 3x3 convolution.
            Defaults to 1.
    """

    expansion = 2

    def __init__(self, inplanes, planes, stride=1, dilation=1):
        super(Bottleneck, self).__init__()
        mid_planes = planes // Bottleneck.expansion

        # Sub-modules are registered in the same order as before so that
        # state_dict keys and random initialisation order stay unchanged.
        self.conv1 = nn.Conv2d(
            inplanes, mid_planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(mid_planes, momentum=BN_MOMENTUM)
        self.conv2 = nn.Conv2d(
            mid_planes,
            mid_planes,
            kernel_size=3,
            stride=stride,
            padding=dilation,
            bias=False,
            dilation=dilation)
        self.bn2 = nn.BatchNorm2d(mid_planes, momentum=BN_MOMENTUM)
        self.conv3 = nn.Conv2d(
            mid_planes, planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.stride = stride

    def forward(self, x, residual=None):
        """Run the block.

        Args:
            x (torch.Tensor): Input feature map.
            residual (torch.Tensor, optional): Shortcut tensor; the input
                itself is used when omitted.

        Returns:
            torch.Tensor: Output feature map.
        """
        shortcut = x if residual is None else residual

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        out += shortcut
        return self.relu(out)
class Root(nn.Module):
    """Aggregation node that fuses several feature maps.

    All inputs are concatenated along the channel axis, passed through a
    1x1 convolution and batch normalisation, optionally summed with the
    first input, and finally activated with ReLU.

    Args:
        in_channels (int): Total channel count of the concatenated inputs.
        out_channels (int): Number of output channels.
        kernel_size (int): Only used to derive the convolution padding as
            ``(kernel_size - 1) // 2``; the convolution kernel itself is
            fixed to 1x1.
        residual (bool): Whether to add the first input to the fused map.
    """

    def __init__(self, in_channels, out_channels, kernel_size, residual):
        super(Root, self).__init__()
        pad = (kernel_size - 1) // 2
        self.conv = nn.Conv2d(
            in_channels, out_channels, 1, stride=1, bias=False, padding=pad)
        self.bn = nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.residual = residual

    def forward(self, *x):
        """Fuse the given feature maps.

        Args:
            *x (torch.Tensor): Feature maps to concatenate on dim 1.

        Returns:
            torch.Tensor: Fused feature map.
        """
        fused = self.bn(self.conv(torch.cat(x, 1)))
        if self.residual:
            fused += x[0]
        return self.relu(fused)
class DLA(nn.Module):
    """Deep Layer Aggregation backbone.

    The network consists of a 7x7 stem (``base_layer``), two plain
    convolutional stages (``level0``, ``level1``) and four aggregation
    stages built from :class:`Tree` (``level2`` .. ``level5``). ``forward``
    returns the output of every level.

    Args:
        levels (list[int]): Depth of each of the six levels.
        channels (list[int]): Output channels of each of the six levels.
        num_classes (int): Stored on the module; not used to build layers
            here. Defaults to 1000.
        block (type): Residual block class used inside the trees.
            Defaults to BasicBlock.
        residual_root (bool): Forwarded to the trees as ``root_residual``.
            Defaults to False.
        linear_root (bool): Accepted for interface compatibility; unused.
        opt (object, optional): Option holder. When it exposes truthy
            ``pre_img`` / ``pre_hm`` attributes, extra stems for a previous
            image (3 channels) / previous heat-map (1 channel) are created.
            Defaults to None, in which case neither stem is created.
    """

    def __init__(self, levels, channels, num_classes=1000,
                 block=BasicBlock, residual_root=False, linear_root=False,
                 opt=None):
        super(DLA, self).__init__()
        self.channels = channels
        self.num_classes = num_classes
        self.base_layer = nn.Sequential(
            nn.Conv2d(3, channels[0], kernel_size=7, stride=1,
                      padding=3, bias=False),
            nn.BatchNorm2d(channels[0], momentum=BN_MOMENTUM),
            nn.ReLU(inplace=True))
        self.level0 = self._make_conv_level(
            channels[0], channels[0], levels[0])
        self.level1 = self._make_conv_level(
            channels[0], channels[1], levels[1], stride=2)
        self.level2 = Tree(levels[2], block, channels[1], channels[2], 2,
                           level_root=False,
                           root_residual=residual_root)
        self.level3 = Tree(levels[3], block, channels[2], channels[3], 2,
                           level_root=True, root_residual=residual_root)
        self.level4 = Tree(levels[4], block, channels[3], channels[4], 2,
                           level_root=True, root_residual=residual_root)
        self.level5 = Tree(levels[5], block, channels[4], channels[5], 2,
                           level_root=True, root_residual=residual_root)

        # `opt` defaults to None, so the flags are read defensively instead
        # of dereferencing `opt` directly (which raised AttributeError).
        if getattr(opt, 'pre_img', False):
            self.pre_img_layer = nn.Sequential(
                nn.Conv2d(3, channels[0], kernel_size=7, stride=1,
                          padding=3, bias=False),
                nn.BatchNorm2d(channels[0], momentum=BN_MOMENTUM),
                nn.ReLU(inplace=True))
        if getattr(opt, 'pre_hm', False):
            self.pre_hm_layer = nn.Sequential(
                nn.Conv2d(1, channels[0], kernel_size=7, stride=1,
                          padding=3, bias=False),
                nn.BatchNorm2d(channels[0], momentum=BN_MOMENTUM),
                nn.ReLU(inplace=True))
        # Explicit weight initialisation is intentionally left disabled:
        # for m in self.modules():
        #     if isinstance(m, nn.Conv2d):
        #         n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
        #         m.weight.data.normal_(0, math.sqrt(2. / n))
        #     elif isinstance(m, nn.BatchNorm2d):
        #         m.weight.data.fill_(1)
        #         m.bias.data.zero_()

    def _make_level(self, block, inplanes, planes, blocks, stride=1):
        """Stack ``blocks`` residual blocks into one sequential level.

        NOTE(review): this helper is not called anywhere in this class and
        passes a ``downsample`` keyword to ``block`` — confirm the block
        class in use accepts it before relying on this method.
        """
        downsample = None
        if stride != 1 or inplanes != planes:
            downsample = nn.Sequential(
                nn.MaxPool2d(stride, stride=stride),
                nn.Conv2d(inplanes, planes,
                          kernel_size=1, stride=1, bias=False),
                nn.BatchNorm2d(planes, momentum=BN_MOMENTUM),
            )

        layers = []
        layers.append(block(inplanes, planes, stride, downsample=downsample))
        for i in range(1, blocks):
            layers.append(block(inplanes, planes))

        return nn.Sequential(*layers)

    def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1):
        """Build ``convs`` consecutive 3x3 conv + BN + ReLU units.

        Only the first convolution uses ``stride``; the following ones use
        stride 1 and take ``planes`` input channels.
        """
        modules = []
        for i in range(convs):
            modules.extend([
                nn.Conv2d(inplanes, planes, kernel_size=3,
                          stride=stride if i == 0 else 1,
                          padding=dilation, bias=False, dilation=dilation),
                nn.BatchNorm2d(planes, momentum=BN_MOMENTUM),
                nn.ReLU(inplace=True)])
            inplanes = planes
        return nn.Sequential(*modules)

    def forward(self, x, pre_img=None, pre_hm=None):
        """Compute the feature pyramid.

        Args:
            x (torch.Tensor): Input image batch.
            pre_img (torch.Tensor, optional): Previous image; its stem
                output is added to the stem output of ``x``. Requires the
                module to have been built with ``opt.pre_img``.
            pre_hm (torch.Tensor, optional): Previous heat-map, handled
                like ``pre_img``. Requires ``opt.pre_hm``.

        Returns:
            list[torch.Tensor]: Outputs of level0 .. level5, in that order.
        """
        y = []
        x = self.base_layer(x)
        if pre_img is not None:
            x = x + self.pre_img_layer(pre_img)
        if pre_hm is not None:
            x = x + self.pre_hm_layer(pre_hm)
        for i in range(6):
            x = getattr(self, 'level{}'.format(i))(x)
            y.append(x)

        return y

    def load_pretrained_model(self, data='imagenet', name='dla34',
                              hash='ba72cf86'):
        """Load pretrained weights into the backbone (non-strict).

        Args:
            data (str): Dataset name used in the download URL, or, when
                ``name`` ends with '.pth', a prefix that is concatenated
                with ``name`` to form a local checkpoint path.
            name (str): Model name or local '.pth' file name.
            hash (str): Hash suffix of the remote checkpoint file.
        """
        # fc = self.fc
        if name.endswith('.pth'):
            model_weights = torch.load(data + name)
        else:
            # Imported here because the module-level imports do not provide
            # `model_zoo`.
            from torch.utils import model_zoo
            model_url = get_model_url(data, name, hash)
            model_weights = model_zoo.load_url(model_url)
        # The classifier size is inferred from the last tensor in the
        # checkpoint so that its shape matches when loading.
        num_classes = len(model_weights[list(model_weights.keys())[-1]])
        self.fc = nn.Conv2d(
            self.channels[-1], num_classes,
            kernel_size=1, stride=1, padding=0, bias=True)
        self.load_state_dict(model_weights, strict=False)
        # self.fc = fc
32, 128, 256, 512, 1024], block=BottleneckX, **kwargs) if pretrained is not None: model.load_pretrained_model( data='imagenet', name='dla60x', hash='d15cacda') return model def dla102x(pretrained=None, **kwargs): # DLA-X-102 BottleneckX.expansion = 2 model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024], block=BottleneckX, residual_root=True, **kwargs) if pretrained is not None: model.load_pretrained_model( data='imagenet', name='dla102x', hash='ad62be81') return model def dla102x2(pretrained=None, **kwargs): # DLA-X-102 64 BottleneckX.cardinality = 64 model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024], block=BottleneckX, residual_root=True, **kwargs) if pretrained is not None: model.load_pretrained_model( data='imagenet', name='dla102x2', hash='262837b6') return model def dla169(pretrained=None, **kwargs): # DLA-169 Bottleneck.expansion = 2 model = DLA([1, 1, 2, 3, 5, 1], [16, 32, 128, 256, 512, 1024], block=Bottleneck, residual_root=True, **kwargs) if pretrained is not None: model.load_pretrained_model( data='imagenet', name='dla169', hash='0914e092') return model class Identity(nn.Module): def __init__(self): super(Identity, self).__init__() def forward(self, x): return x def fill_fc_weights(layers): for m in layers.modules(): if isinstance(m, nn.Conv2d): if m.bias is not None: nn.init.constant_(m.bias, 0) def fill_up_weights(up): w = up.weight.data f = math.ceil(w.size(2) / 2) c = (2 * f - 1 - f % 2) / (2. 
* f) for i in range(w.size(2)): for j in range(w.size(3)): w[0, 0, i, j] = \ (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) for c in range(1, w.size(0)): w[c, 0, :, :] = w[0, 0, :, :] class Conv(nn.Module): def __init__(self, chi, cho): super(Conv, self).__init__() self.conv = nn.Sequential( nn.Conv2d(chi, cho, kernel_size=1, stride=1, bias=False), nn.BatchNorm2d(cho, momentum=BN_MOMENTUM), nn.ReLU(inplace=True)) def forward(self, x): return self.conv(x) class GlobalConv(nn.Module): def __init__(self, chi, cho, k=7, d=1): super(GlobalConv, self).__init__() gcl = nn.Sequential( nn.Conv2d(chi, cho, kernel_size=(k, 1), stride=1, bias=False, dilation=d, padding=(d * (k // 2), 0)), nn.Conv2d(cho, cho, kernel_size=(1, k), stride=1, bias=False, dilation=d, padding=(0, d * (k // 2)))) gcr = nn.Sequential( nn.Conv2d(chi, cho, kernel_size=(1, k), stride=1, bias=False, dilation=d, padding=(0, d * (k // 2))), nn.Conv2d(cho, cho, kernel_size=(k, 1), stride=1, bias=False, dilation=d, padding=(d * (k // 2), 0))) fill_fc_weights(gcl) fill_fc_weights(gcr) self.gcl = gcl self.gcr = gcr self.act = nn.Sequential( nn.BatchNorm2d(cho, momentum=BN_MOMENTUM), nn.ReLU(inplace=True) ) def forward(self, x): x = self.gcl(x) + self.gcr(x) x = self.act(x) return x class DeformConv(nn.Module): def __init__(self, chi, cho): super(DeformConv, self).__init__() self.actf = nn.Sequential( nn.BatchNorm2d(cho, momentum=BN_MOMENTUM), nn.ReLU(inplace=True) ) self.conv = DCN(chi, cho, kernel_size=(3, 3), stride=1, padding=1, dilation=1, deformable_groups=1) def forward(self, x): x = self.conv(x) x = self.actf(x) return x class IDAUp(nn.Module): def __init__(self, o, channels, up_f, node_type=(DeformConv, DeformConv)): super(IDAUp, self).__init__() for i in range(1, len(channels)): c = channels[i] f = int(up_f[i]) proj = node_type[0](c, o) node = node_type[1](o, o) up = nn.ConvTranspose2d(o, o, f * 2, stride=f, padding=f // 2, output_padding=0, groups=o, bias=False) fill_up_weights(up) 
setattr(self, 'proj_' + str(i), proj) setattr(self, 'up_' + str(i), up) setattr(self, 'node_' + str(i), node) def forward(self, layers, startp, endp): for i in range(startp + 1, endp): upsample = getattr(self, 'up_' + str(i - startp)) project = getattr(self, 'proj_' + str(i - startp)) layers[i] = upsample(project(layers[i])) node = getattr(self, 'node_' + str(i - startp)) layers[i] = node(layers[i] + layers[i - 1]) class DLAUp(nn.Module): def __init__(self, startp, channels, scales, in_channels=None, node_type=DeformConv): super(DLAUp, self).__init__() self.startp = startp if in_channels is None: in_channels = channels self.channels = channels channels = list(channels) scales = np.array(scales, dtype=int) for i in range(len(channels) - 1): j = -i - 2 setattr(self, 'ida_{}'.format(i), IDAUp(channels[j], in_channels[j:], scales[j:] // scales[j], node_type=node_type)) scales[j + 1:] = scales[j] in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]] def forward(self, layers): out = [layers[-1]] # start with 32 for i in range(len(layers) - self.startp - 1): ida = getattr(self, 'ida_{}'.format(i)) ida(layers, len(layers) - i - 2, len(layers)) out.insert(0, layers[-1]) return out class Interpolate(nn.Module): def __init__(self, scale, mode): super(Interpolate, self).__init__() self.scale = scale self.mode = mode def forward(self, x): x = F.interpolate(x, scale_factor=self.scale, mode=self.mode, align_corners=False) return x DLA_NODE = { 'dcn': (DeformConv, DeformConv), 'gcn': (Conv, GlobalConv), 'conv': (Conv, Conv), } class BaseModel(nn.Module): def __init__(self, heads, head_convs, num_stacks, last_channel, opt=None): super(BaseModel, self).__init__() if opt is not None and opt.head_kernel != 3: print('Using head kernel:', opt.head_kernel) head_kernel = opt.head_kernel else: head_kernel = 3 self.num_stacks = num_stacks self.heads = heads for head in self.heads: classes = self.heads[head] head_conv = head_convs[head] if len(head_conv) > 0: out = 
nn.Conv2d(head_conv[-1], classes, kernel_size=1, stride=1, padding=0, bias=True) conv = nn.Conv2d(last_channel, head_conv[0], kernel_size=head_kernel, padding=head_kernel // 2, bias=True) convs = [conv] for k in range(1, len(head_conv)): convs.append(nn.Conv2d(head_conv[k - 1], head_conv[k], kernel_size=1, bias=True)) if len(convs) == 1: fc = nn.Sequential(conv, nn.ReLU(inplace=True), out) elif len(convs) == 2: fc = nn.Sequential( convs[0], nn.ReLU(inplace=True), convs[1], nn.ReLU(inplace=True), out) elif len(convs) == 3: fc = nn.Sequential( convs[0], nn.ReLU(inplace=True), convs[1], nn.ReLU(inplace=True), convs[2], nn.ReLU(inplace=True), out) elif len(convs) == 4: fc = nn.Sequential( convs[0], nn.ReLU(inplace=True), convs[1], nn.ReLU(inplace=True), convs[2], nn.ReLU(inplace=True), convs[3], nn.ReLU(inplace=True), out) if 'hm' in head: fc[-1].bias.data.fill_(opt.prior_bias) else: fill_fc_weights(fc) else: fc = nn.Conv2d(last_channel, classes, kernel_size=1, stride=1, padding=0, bias=True) if 'hm' in head: fc.bias.data.fill_(opt.prior_bias) else: fill_fc_weights(fc) self.__setattr__(head, fc) def img2feats(self, x): raise NotImplementedError def imgpre2feats(self, x, pre_img=None, pre_hm=None): raise NotImplementedError def forward(self, x, pre_img=None, pre_hm=None): if (pre_hm is not None) or (pre_img is not None): feats = self.imgpre2feats(x, pre_img, pre_hm) else: feats = self.img2feats(x) return feats # out = [] # if self.opt.model_output_list: # for s in range(self.num_stacks): # z = [] # for head in sorted(self.heads): # z.append(self.__getattr__(head)(feats[s])) # out.append(z) # else: # for s in range(self.num_stacks): # z = {} # for head in self.heads: # z[head] = self.__getattr__(head)(feats[s]) # out.append(z) # return out @BACKBONES.register_module() class DLASeg(BaseModel): def __init__(self, num_layers, heads, head_convs): opt = Opt() super(DLASeg, self).__init__( heads, head_convs, 1, 64 if num_layers == 34 else 128, opt=opt) down_ratio = 4 self.opt 
= opt self.node_type = DLA_NODE[opt.dla_node] print('Using node type:', self.node_type) self.first_level = int(np.log2(down_ratio)) self.last_level = 5 self.base = globals()['dla{}'.format(num_layers)](pretrained=False, opt=opt) channels = self.base.channels scales = [2 ** i for i in range(len(channels[self.first_level:]))] self.dla_up = DLAUp( self.first_level, channels[self.first_level:], scales, node_type=self.node_type) out_channel = channels[self.first_level] self.ida_up = IDAUp( out_channel, channels[self.first_level:self.last_level], [2 ** i for i in range(self.last_level - self.first_level)], node_type=self.node_type) def init_weights(self, pretrained=None): if isinstance(pretrained, str): logger = get_root_logger() load_checkpoint(self, pretrained, strict=False, logger=logger) else: pass def img2feats(self, x): x = self.base(x) x = self.dla_up(x) y = [] for i in range(self.last_level - self.first_level): y.append(x[i].clone()) self.ida_up(y, 0, len(y)) return [y[-1]] def imgpre2feats(self, x, pre_img=None, pre_hm=None): x = self.base(x, pre_img, pre_hm) x = self.dla_up(x) y = [] for i in range(self.last_level - self.first_level): y.append(x[i].clone()) self.ida_up(y, 0, len(y)) return [y[-1]] class Opt: head_kernel = 3 levels = [1, 1, 1, 2, 2, 1] channels = [16, 32, 64, 128, 256, 512] pre_img = False pre_hm = False dla_node = 'dcn' model_output_list = False # if __name__ == '__main__': # from mmdet.models import DLASeg # opt = Opt() # model = DLASeg(34, {}, -1, Opt) # checkpoints = torch.load('checkpoints/nuScenes_3Ddetection_e140.pth') # model.load_state_dict(checkpoints['state_dict'], strict=False) ================================================ FILE: mmdet3d/models/backbones/__init__.py ================================================ from mmdet.models.backbones import SSDVGG, HRNet, ResNet, ResNetV1d, ResNeXt from .multi_backbone import MultiBackbone from .nostem_regnet import NoStemRegNet from .pointnet2_sa_msg import PointNet2SAMSG from 
.pointnet2_sa_ssg import PointNet2SASSG from .second import SECOND from .DLA import DLASeg from .swin import SwinTransformer __all__ = [ 'ResNet', 'ResNetV1d', 'ResNeXt', 'SSDVGG', 'HRNet', 'NoStemRegNet', 'SECOND', 'PointNet2SASSG', 'PointNet2SAMSG', 'MultiBackbone', 'DLASeg', 'SwinTransformer' ] ================================================ FILE: mmdet3d/models/backbones/base_pointnet.py ================================================ from abc import ABCMeta from mmcv.runner import load_checkpoint from torch import nn as nn class BasePointNet(nn.Module, metaclass=ABCMeta): """Base class for PointNet.""" def __init__(self): super(BasePointNet, self).__init__() self.fp16_enabled = False def init_weights(self, pretrained=None): """Initialize the weights of PointNet backbone.""" # Do not initialize the conv layers # to follow the original implementation if isinstance(pretrained, str): from mmdet3d.utils import get_root_logger logger = get_root_logger() load_checkpoint(self, pretrained, strict=False, logger=logger) @staticmethod def _split_point_feats(points): """Split coordinates and features of input points. Args: points (torch.Tensor): Point coordinates with features, with shape (B, N, 3 + input_feature_dim). Returns: torch.Tensor: Coordinates of input points. torch.Tensor: Features of input points. """ xyz = points[..., 0:3].contiguous() if points.size(-1) > 3: features = points[..., 3:].transpose(1, 2).contiguous() else: features = None return xyz, features ================================================ FILE: mmdet3d/models/backbones/multi_backbone.py ================================================ import copy import torch from mmcv.cnn import ConvModule from mmcv.runner import auto_fp16, load_checkpoint from torch import nn as nn from mmdet.models import BACKBONES, build_backbone @BACKBONES.register_module() class MultiBackbone(nn.Module): """MultiBackbone with different configs. Args: num_streams (int): The number of backbones. 
backbones (list or dict): A list of backbone configs. aggregation_mlp_channels (list[int]): Specify the mlp layers for feature aggregation. conv_cfg (dict): Config dict of convolutional layers. norm_cfg (dict): Config dict of normalization layers. act_cfg (dict): Config dict of activation layers. suffixes (list): A list of suffixes to rename the return dict for each backbone. """ def __init__(self, num_streams, backbones, aggregation_mlp_channels=None, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.01), act_cfg=dict(type='ReLU'), suffixes=('net0', 'net1'), **kwargs): super().__init__() assert isinstance(backbones, dict) or isinstance(backbones, list) if isinstance(backbones, dict): backbones_list = [] for ind in range(num_streams): backbones_list.append(copy.deepcopy(backbones)) backbones = backbones_list assert len(backbones) == num_streams assert len(suffixes) == num_streams self.backbone_list = nn.ModuleList() # Rename the ret_dict with different suffixs. 
self.suffixes = suffixes out_channels = 0 for backbone_cfg in backbones: out_channels += backbone_cfg['fp_channels'][-1][-1] self.backbone_list.append(build_backbone(backbone_cfg)) # Feature aggregation layers if aggregation_mlp_channels is None: aggregation_mlp_channels = [ out_channels, out_channels // 2, out_channels // len(self.backbone_list) ] else: aggregation_mlp_channels.insert(0, out_channels) self.aggregation_layers = nn.Sequential() for i in range(len(aggregation_mlp_channels) - 1): self.aggregation_layers.add_module( f'layer{i}', ConvModule( aggregation_mlp_channels[i], aggregation_mlp_channels[i + 1], 1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, bias=True, inplace=True)) def init_weights(self, pretrained=None): """Initialize the weights of PointNet++ backbone.""" # Do not initialize the conv layers # to follow the original implementation if isinstance(pretrained, str): from mmdet3d.utils import get_root_logger logger = get_root_logger() load_checkpoint(self, pretrained, strict=False, logger=logger) @auto_fp16() def forward(self, points): """Forward pass. Args: points (torch.Tensor): point coordinates with features, with shape (B, N, 3 + input_feature_dim). Returns: dict[str, list[torch.Tensor]]: Outputs from multiple backbones. - fp_xyz[suffix] (list[torch.Tensor]): The coordinates of each fp features. - fp_features[suffix] (list[torch.Tensor]): The features from each Feature Propagate Layers. - fp_indices[suffix] (list[torch.Tensor]): Indices of the input points. - hd_feature (torch.Tensor): The aggregation feature from multiple backbones. 
""" ret = {} fp_features = [] for ind in range(len(self.backbone_list)): cur_ret = self.backbone_list[ind](points) cur_suffix = self.suffixes[ind] fp_features.append(cur_ret['fp_features'][-1]) if cur_suffix != '': for k in cur_ret.keys(): cur_ret[k + '_' + cur_suffix] = cur_ret.pop(k) ret.update(cur_ret) # Combine the features here hd_feature = torch.cat(fp_features, dim=1) hd_feature = self.aggregation_layers(hd_feature) ret['hd_feature'] = hd_feature return ret ================================================ FILE: mmdet3d/models/backbones/nostem_regnet.py ================================================ from mmdet.models.backbones import RegNet from ..builder import BACKBONES @BACKBONES.register_module() class NoStemRegNet(RegNet): """RegNet backbone without Stem for 3D detection. More details can be found in `paper `_ . Args: arch (dict): The parameter of RegNets. - w0 (int): Initial width. - wa (float): Slope of width. - wm (float): Quantization parameter to quantize the width. - depth (int): Depth of the backbone. - group_w (int): Width of group. - bot_mul (float): Bottleneck ratio, i.e. expansion of bottlneck. strides (Sequence[int]): Strides of the first block of each stage. base_channels (int): Base channels after stem layer. in_channels (int): Number of input image channels. Normally 3. dilations (Sequence[int]): Dilation of each stage. out_indices (Sequence[int]): Output from which stages. style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two layer is the 3x3 conv layer, otherwise the stride-two layer is the first 1x1 conv layer. frozen_stages (int): Stages to be frozen (all param fixed). -1 means not freezing any parameters. norm_cfg (dict): Dictionary to construct and config norm layer. norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). Note: Effect on Batch Norm and its variants only. with_cp (bool): Use checkpoint or not. 
Using checkpoint will save some memory while slowing down the training speed. zero_init_residual (bool): Whether to use zero init for last norm layer in resblocks to let them behave as identity. Example: >>> from mmdet3d.models import NoStemRegNet >>> import torch >>> self = NoStemRegNet( arch=dict( w0=88, wa=26.31, wm=2.25, group_w=48, depth=25, bot_mul=1.0)) >>> self.eval() >>> inputs = torch.rand(1, 64, 16, 16) >>> level_outputs = self.forward(inputs) >>> for level_out in level_outputs: ... print(tuple(level_out.shape)) (1, 96, 8, 8) (1, 192, 4, 4) (1, 432, 2, 2) (1, 1008, 1, 1) """ def __init__(self, arch, **kwargs): super(NoStemRegNet, self).__init__(arch, **kwargs) def _make_stem_layer(self, in_channels, base_channels): """Override the original function that do not initialize a stem layer since 3D detector's voxel encoder works like a stem layer.""" return def forward(self, x): """Forward function of backbone. Args: x (torch.Tensor): Features in shape (N, C, H, W). Returns: tuple[torch.Tensor]: Multi-scale features. """ outs = [] for i, layer_name in enumerate(self.res_layers): res_layer = getattr(self, layer_name) x = res_layer(x) if i in self.out_indices: outs.append(x) return tuple(outs) ================================================ FILE: mmdet3d/models/backbones/pointnet2_sa_msg.py ================================================ import torch from mmcv.cnn import ConvModule from mmcv.runner import auto_fp16 from torch import nn as nn from mmdet3d.ops import build_sa_module from mmdet.models import BACKBONES from .base_pointnet import BasePointNet @BACKBONES.register_module() class PointNet2SAMSG(BasePointNet): """PointNet2 with Multi-scale grouping. Args: in_channels (int): Input channels of point cloud. num_points (tuple[int]): The number of points which each SA module samples. radii (tuple[float]): Sampling radii of each SA module. num_samples (tuple[int]): The number of samples for ball query in each SA module. 
sa_channels (tuple[tuple[int]]): Out channels of each mlp in SA module. aggregation_channels (tuple[int]): Out channels of aggregation multi-scale grouping features. fps_mods (tuple[int]): Mod of FPS for each SA module. fps_sample_range_lists (tuple[tuple[int]]): The number of sampling points which each SA module samples. dilated_group (tuple[bool]): Whether to use dilated ball query for out_indices (Sequence[int]): Output from which stages. norm_cfg (dict): Config of normalization layer. sa_cfg (dict): Config of set abstraction module, which may contain the following keys and values: - pool_mod (str): Pool method ('max' or 'avg') for SA modules. - use_xyz (bool): Whether to use xyz as a part of features. - normalize_xyz (bool): Whether to normalize xyz with radii in each SA module. """ def __init__(self, in_channels, num_points=(2048, 1024, 512, 256), radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)), num_samples=((32, 32, 64), (32, 32, 64), (32, 32, 32)), sa_channels=(((16, 16, 32), (16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 64, 128), (64, 96, 128)), ((128, 128, 256), (128, 192, 256), (128, 256, 256))), aggregation_channels=(64, 128, 256), fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')), fps_sample_range_lists=((-1), (-1), (512, -1)), dilated_group=(True, True, True), out_indices=(2, ), norm_cfg=dict(type='BN2d'), sa_cfg=dict( type='PointSAModuleMSG', pool_mod='max', use_xyz=True, normalize_xyz=False)): super().__init__() self.num_sa = len(sa_channels) self.out_indices = out_indices assert max(out_indices) < self.num_sa assert len(num_points) == len(radii) == len(num_samples) == len( sa_channels) == len(aggregation_channels) self.SA_modules = nn.ModuleList() self.aggregation_mlps = nn.ModuleList() sa_in_channel = in_channels - 3 # number of channels without xyz skip_channel_list = [sa_in_channel] for sa_index in range(self.num_sa): cur_sa_mlps = list(sa_channels[sa_index]) sa_out_channel = 0 for radius_index in range(len(radii[sa_index])): 
cur_sa_mlps[radius_index] = [sa_in_channel] + list( cur_sa_mlps[radius_index]) sa_out_channel += cur_sa_mlps[radius_index][-1] if isinstance(fps_mods[sa_index], tuple): cur_fps_mod = list(fps_mods[sa_index]) else: cur_fps_mod = list([fps_mods[sa_index]]) if isinstance(fps_sample_range_lists[sa_index], tuple): cur_fps_sample_range_list = list( fps_sample_range_lists[sa_index]) else: cur_fps_sample_range_list = list( [fps_sample_range_lists[sa_index]]) self.SA_modules.append( build_sa_module( num_point=num_points[sa_index], radii=radii[sa_index], sample_nums=num_samples[sa_index], mlp_channels=cur_sa_mlps, fps_mod=cur_fps_mod, fps_sample_range_list=cur_fps_sample_range_list, dilated_group=dilated_group[sa_index], norm_cfg=norm_cfg, cfg=sa_cfg, bias=True)) skip_channel_list.append(sa_out_channel) self.aggregation_mlps.append( ConvModule( sa_out_channel, aggregation_channels[sa_index], conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), kernel_size=1, bias=True)) sa_in_channel = aggregation_channels[sa_index] @auto_fp16(apply_to=('points', )) def forward(self, points): """Forward pass. Args: points (torch.Tensor): point coordinates with features, with shape (B, N, 3 + input_feature_dim). Returns: dict[str, torch.Tensor]: Outputs of the last SA module. - sa_xyz (torch.Tensor): The coordinates of sa features. - sa_features (torch.Tensor): The features from the last Set Aggregation Layers. - sa_indices (torch.Tensor): Indices of the \ input points. 
""" xyz, features = self._split_point_feats(points) batch, num_points = xyz.shape[:2] indices = xyz.new_tensor(range(num_points)).unsqueeze(0).repeat( batch, 1).long() sa_xyz = [xyz] sa_features = [features] sa_indices = [indices] out_sa_xyz = [] out_sa_features = [] out_sa_indices = [] for i in range(self.num_sa): cur_xyz, cur_features, cur_indices = self.SA_modules[i]( sa_xyz[i], sa_features[i]) cur_features = self.aggregation_mlps[i](cur_features) sa_xyz.append(cur_xyz) sa_features.append(cur_features) sa_indices.append( torch.gather(sa_indices[-1], 1, cur_indices.long())) if i in self.out_indices: out_sa_xyz.append(sa_xyz[-1]) out_sa_features.append(sa_features[-1]) out_sa_indices.append(sa_indices[-1]) return dict( sa_xyz=out_sa_xyz, sa_features=out_sa_features, sa_indices=out_sa_indices) ================================================ FILE: mmdet3d/models/backbones/pointnet2_sa_ssg.py ================================================ import torch from mmcv.runner import auto_fp16 from torch import nn as nn from mmdet3d.ops import PointFPModule, build_sa_module from mmdet.models import BACKBONES from .base_pointnet import BasePointNet @BACKBONES.register_module() class PointNet2SASSG(BasePointNet): """PointNet2 with Single-scale grouping. Args: in_channels (int): Input channels of point cloud. num_points (tuple[int]): The number of points which each SA module samples. radius (tuple[float]): Sampling radii of each SA module. num_samples (tuple[int]): The number of samples for ball query in each SA module. sa_channels (tuple[tuple[int]]): Out channels of each mlp in SA module. fp_channels (tuple[tuple[int]]): Out channels of each mlp in FP module. norm_cfg (dict): Config of normalization layer. sa_cfg (dict): Config of set abstraction module, which may contain the following keys and values: - pool_mod (str): Pool method ('max' or 'avg') for SA modules. - use_xyz (bool): Whether to use xyz as a part of features. 
- normalize_xyz (bool): Whether to normalize xyz with radii in each SA module. """ def __init__(self, in_channels, num_points=(2048, 1024, 512, 256), radius=(0.2, 0.4, 0.8, 1.2), num_samples=(64, 32, 16, 16), sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), (128, 128, 256)), fp_channels=((256, 256), (256, 256)), norm_cfg=dict(type='BN2d'), sa_cfg=dict( type='PointSAModule', pool_mod='max', use_xyz=True, normalize_xyz=True)): super().__init__() self.num_sa = len(sa_channels) self.num_fp = len(fp_channels) assert len(num_points) == len(radius) == len(num_samples) == len( sa_channels) assert len(sa_channels) >= len(fp_channels) self.SA_modules = nn.ModuleList() sa_in_channel = in_channels - 3 # number of channels without xyz skip_channel_list = [sa_in_channel] for sa_index in range(self.num_sa): cur_sa_mlps = list(sa_channels[sa_index]) cur_sa_mlps = [sa_in_channel] + cur_sa_mlps sa_out_channel = cur_sa_mlps[-1] self.SA_modules.append( build_sa_module( num_point=num_points[sa_index], radius=radius[sa_index], num_sample=num_samples[sa_index], mlp_channels=cur_sa_mlps, norm_cfg=norm_cfg, cfg=sa_cfg)) skip_channel_list.append(sa_out_channel) sa_in_channel = sa_out_channel self.FP_modules = nn.ModuleList() fp_source_channel = skip_channel_list.pop() fp_target_channel = skip_channel_list.pop() for fp_index in range(len(fp_channels)): cur_fp_mlps = list(fp_channels[fp_index]) cur_fp_mlps = [fp_source_channel + fp_target_channel] + cur_fp_mlps self.FP_modules.append(PointFPModule(mlp_channels=cur_fp_mlps)) if fp_index != len(fp_channels) - 1: fp_source_channel = cur_fp_mlps[-1] fp_target_channel = skip_channel_list.pop() @auto_fp16(apply_to=('points', )) def forward(self, points): """Forward pass. Args: points (torch.Tensor): point coordinates with features, with shape (B, N, 3 + input_feature_dim). Returns: dict[str, list[torch.Tensor]]: Outputs after SA and FP modules. - fp_xyz (list[torch.Tensor]): The coordinates of \ each fp features. 
- fp_features (list[torch.Tensor]): The features \ from each Feature Propagate Layers. - fp_indices (list[torch.Tensor]): Indices of the \ input points. """ xyz, features = self._split_point_feats(points) batch, num_points = xyz.shape[:2] indices = xyz.new_tensor(range(num_points)).unsqueeze(0).repeat( batch, 1).long() sa_xyz = [xyz] sa_features = [features] sa_indices = [indices] for i in range(self.num_sa): cur_xyz, cur_features, cur_indices = self.SA_modules[i]( sa_xyz[i], sa_features[i]) sa_xyz.append(cur_xyz) sa_features.append(cur_features) sa_indices.append( torch.gather(sa_indices[-1], 1, cur_indices.long())) fp_xyz = [sa_xyz[-1]] fp_features = [sa_features[-1]] fp_indices = [sa_indices[-1]] for i in range(self.num_fp): fp_features.append(self.FP_modules[i]( sa_xyz[self.num_sa - i - 1], sa_xyz[self.num_sa - i], sa_features[self.num_sa - i - 1], fp_features[-1])) fp_xyz.append(sa_xyz[self.num_sa - i - 1]) fp_indices.append(sa_indices[self.num_sa - i - 1]) ret = dict( fp_xyz=fp_xyz, fp_features=fp_features, fp_indices=fp_indices) return ret ================================================ FILE: mmdet3d/models/backbones/second.py ================================================ from mmcv.cnn import build_conv_layer, build_norm_layer from mmcv.runner import load_checkpoint from torch import nn as nn from mmdet.models import BACKBONES @BACKBONES.register_module() class SECOND(nn.Module): """Backbone network for SECOND/PointPillars/PartA2/MVXNet. Args: in_channels (int): Input channels. out_channels (list[int]): Output channels for multi-scale feature maps. layer_nums (list[int]): Number of layers in each stage. layer_strides (list[int]): Strides of each stage. norm_cfg (dict): Config dict of normalization layers. conv_cfg (dict): Config dict of convolutional layers. 
""" def __init__(self, in_channels=128, out_channels=[128, 128, 256], layer_nums=[3, 5, 5], layer_strides=[2, 2, 2], norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)): super(SECOND, self).__init__() assert len(layer_strides) == len(layer_nums) assert len(out_channels) == len(layer_nums) in_filters = [in_channels, *out_channels[:-1]] # note that when stride > 1, conv2d with same padding isn't # equal to pad-conv2d. we should use pad-conv2d. blocks = [] for i, layer_num in enumerate(layer_nums): block = [ build_conv_layer( conv_cfg, in_filters[i], out_channels[i], 3, stride=layer_strides[i], padding=1), build_norm_layer(norm_cfg, out_channels[i])[1], nn.ReLU(inplace=True), ] for j in range(layer_num): block.append( build_conv_layer( conv_cfg, out_channels[i], out_channels[i], 3, padding=1)) block.append(build_norm_layer(norm_cfg, out_channels[i])[1]) block.append(nn.ReLU(inplace=True)) block = nn.Sequential(*block) blocks.append(block) self.blocks = nn.ModuleList(blocks) def init_weights(self, pretrained=None): """Initialize weights of the 2D backbone.""" # Do not initialize the conv layers # to follow the original implementation if isinstance(pretrained, str): from mmdet3d.utils import get_root_logger logger = get_root_logger() load_checkpoint(self, pretrained, strict=False, logger=logger) def forward(self, x): """Forward function. Args: x (torch.Tensor): Input with shape (N, C, H, W). Returns: tuple[torch.Tensor]: Multi-scale features. """ outs = [] for i in range(len(self.blocks)): x = self.blocks[i](x) outs.append(x) return tuple(outs) ================================================ FILE: mmdet3d/models/backbones/swin.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import warnings from collections import OrderedDict from copy import deepcopy from typing import Sequence, Iterable, Optional from torch import Tensor import math import torch import torch.nn as nn import torch.nn.functional as F import torch.utils.checkpoint as cp from mmcv.cnn import build_norm_layer, constant_init, build_conv_layer, build_activation_layer, xavier_init from mmcv.runner import BaseModule, _load_checkpoint from mmcv.utils import get_logger from mmdet.models.builder import BACKBONES from mmdet3d.models.utils.drop import build_dropout from mmdet3d.models.utils.transformer import FFN, to_2tuple, ModuleList from mmdet.utils import get_root_logger def _no_grad_trunc_normal_(tensor: Tensor, mean: float, std: float, a: float, b: float) -> Tensor: # Method based on # https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf # Modified from # https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py def norm_cdf(x): # Computes standard normal cumulative distribution function return (1. + math.erf(x / math.sqrt(2.))) / 2. if (mean < a - 2 * std) or (mean > b + 2 * std): warnings.warn( 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. ' 'The distribution of values may be incorrect.', stacklevel=2) with torch.no_grad(): # Values are generated by using a truncated uniform distribution and # then using the inverse CDF for the normal distribution. # Get upper and lower cdf values lower = norm_cdf((a - mean) / std) upper = norm_cdf((b - mean) / std) # Uniformly fill tensor with values from [lower, upper], then translate # to [2lower-1, 2upper-1]. 
tensor.uniform_(2 * lower - 1, 2 * upper - 1) # Use inverse cdf transform for normal distribution to get truncated # standard normal tensor.erfinv_() # Transform to proper mean, std tensor.mul_(std * math.sqrt(2.)) tensor.add_(mean) # Clamp to ensure it's in the proper range tensor.clamp_(min=a, max=b) return tensor def trunc_normal_(tensor: Tensor, mean: float = 0., std: float = 1., a: float = -2., b: float = 2.) -> Tensor: r"""Fills the input Tensor with values drawn from a truncated normal distribution. The values are effectively drawn from the normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values outside :math:`[a, b]` redrawn until they are within the bounds. The method used for generating the random values works best when :math:`a \leq \text{mean} \leq b`. Modified from https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py Args: tensor (``torch.Tensor``): an n-dimensional `torch.Tensor`. mean (float): the mean of the normal distribution. std (float): the standard deviation of the normal distribution. a (float): the minimum cutoff value. b (float): the maximum cutoff value. """ return _no_grad_trunc_normal_(tensor, mean, std, a, b) def trunc_normal_init(module: nn.Module, mean: float = 0, std: float = 1, a: float = -2, b: float = 2, bias: float = 0) -> None: if hasattr(module, 'weight') and module.weight is not None: trunc_normal_(module.weight, mean, std, a, b) # type: ignore if hasattr(module, 'bias') and module.bias is not None: nn.init.constant_(module.bias, bias) # type: ignore class AdaptivePadding(nn.Module): """Applies padding to input (if needed) so that input can get fully covered by filter you specified. It support two modes "same" and "corner". The "same" mode is same with "SAME" padding mode in TensorFlow, pad zero around input. The "corner" mode would pad zero to bottom right. Args: kernel_size (int | tuple): Size of the kernel: stride (int | tuple): Stride of the filter. 
class PatchEmbed(BaseModule):
    """Turn an image into a sequence of patch embeddings with one conv layer.

    Args:
        in_channels (int): Number of input channels. Default: 3.
        embed_dims (int): Channel width of the produced embedding.
            Default: 768.
        conv_type (str): Layer type name handed to ``build_conv_layer`` for
            the embedding conv. Default: "Conv2d".
        kernel_size (int): Kernel size of the embedding conv. Default: 16.
        stride (int | None): Stride of the embedding conv. ``None`` falls
            back to ``kernel_size``. Default: 16.
        padding (int | tuple | str): Padding of the embedding conv. A string
            is forwarded to ``AdaptivePadding`` as its padding mode, and the
            conv itself is then built with zero padding. Default: "corner".
        dilation (int): Dilation of the embedding conv. Default: 1.
        bias (bool): Whether the embedding conv has a bias. Default: True.
        norm_cfg (dict, optional): Config of the normalization layer applied
            to the flattened tokens. Default: None.
        input_size (int | tuple | None): Expected input size. When given,
            ``init_input_size`` and ``init_out_size`` are precomputed from
            it; otherwise both are ``None``. Default: None.
        init_cfg (`mmcv.ConfigDict`, optional): The Config for
            initialization. Default: None.
    """

    def __init__(
        self,
        in_channels=3,
        embed_dims=768,
        conv_type='Conv2d',
        kernel_size=16,
        stride=16,
        padding='corner',
        dilation=1,
        bias=True,
        norm_cfg=None,
        input_size=None,
        init_cfg=None,
    ):
        super(PatchEmbed, self).__init__(init_cfg=init_cfg)
        self.embed_dims = embed_dims

        # Normalise every geometric argument to a (h, w) pair.
        kernel_hw = to_2tuple(kernel_size)
        stride_hw = to_2tuple(kernel_size if stride is None else stride)
        dilation_hw = to_2tuple(dilation)

        # A string padding means "pad adaptively before the conv"; the conv
        # then runs without any padding of its own.
        adaptive = isinstance(padding, str)
        if adaptive:
            self.adap_padding = AdaptivePadding(
                kernel_size=kernel_hw,
                stride=stride_hw,
                dilation=dilation_hw,
                padding=padding)
        else:
            self.adap_padding = None
        conv_pad_hw = to_2tuple(0 if adaptive else padding)

        self.projection = build_conv_layer(
            dict(type=conv_type),
            in_channels=in_channels,
            out_channels=embed_dims,
            kernel_size=kernel_hw,
            stride=stride_hw,
            padding=conv_pad_hw,
            dilation=dilation_hw,
            bias=bias)

        self.norm = (None if norm_cfg is None else build_norm_layer(
            norm_cfg, embed_dims)[1])

        if not input_size:
            self.init_input_size = None
            self.init_out_size = None
            return

        # Precompute the token-grid size for the declared input size so that
        # code outside this layer can derive the number of patches.
        size_hw = to_2tuple(input_size)
        self.init_input_size = size_hw
        if self.adap_padding:
            extra_h, extra_w = self.adap_padding.get_pad_shape(size_hw)
            base_h, base_w = size_hw
            size_hw = (base_h + extra_h, base_w + extra_w)

        # Output-size formula from
        # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
        self.init_out_size = tuple(
            (size_hw[axis] + 2 * conv_pad_hw[axis] - dilation_hw[axis] *
             (kernel_hw[axis] - 1) - 1) // stride_hw[axis] + 1
            for axis in range(2))

    def forward(self, x):
        """
        Args:
            x (Tensor): Has shape (B, C, H, W). In most case, C is 3.

        Returns:
            tuple: Contains merged results and its spatial shape.

                - x (Tensor): Has shape (B, out_h * out_w, embed_dims)
                - out_size (tuple[int]): Spatial shape of x, arrange as
                    (out_h, out_w).
        """
        if self.adap_padding:
            x = self.adap_padding(x)

        feat = self.projection(x)
        out_size = (feat.shape[2], feat.shape[3])
        # (B, C, H, W) -> (B, H * W, C)
        tokens = feat.flatten(2).transpose(1, 2)
        if self.norm is None:
            return tokens, out_size
        return self.norm(tokens), out_size
dilation (int | tuple, optional): dilation parameter in the unfold layer. Default: 1. bias (bool, optional): Whether to add bias in linear layer or not. Defaults: False. norm_cfg (dict, optional): Config dict for normalization layer. Default: dict(type='LN'). init_cfg (dict, optional): The extra config for initialization. Default: None. """ def __init__(self, in_channels, out_channels, kernel_size=2, stride=None, padding='corner', dilation=1, bias=False, norm_cfg=dict(type='LN'), init_cfg=None): super().__init__(init_cfg=init_cfg) self.in_channels = in_channels self.out_channels = out_channels if stride: stride = stride else: stride = kernel_size kernel_size = to_2tuple(kernel_size) stride = to_2tuple(stride) dilation = to_2tuple(dilation) if isinstance(padding, str): self.adap_padding = AdaptivePadding( kernel_size=kernel_size, stride=stride, dilation=dilation, padding=padding) # disable the padding of unfold padding = 0 else: self.adap_padding = None padding = to_2tuple(padding) self.sampler = nn.Unfold( kernel_size=kernel_size, dilation=dilation, padding=padding, stride=stride) sample_dim = kernel_size[0] * kernel_size[1] * in_channels if norm_cfg is not None: self.norm = build_norm_layer(norm_cfg, sample_dim)[1] else: self.norm = None self.reduction = nn.Linear(sample_dim, out_channels, bias=bias) def forward(self, x, input_size): """ Args: x (Tensor): Has shape (B, H*W, C_in). input_size (tuple[int]): The spatial shape of x, arrange as (H, W). Default: None. Returns: tuple: Contains merged results and its spatial shape. - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out) - out_size (tuple[int]): Spatial shape of x, arrange as (Merged_H, Merged_W). """ B, L, C = x.shape assert isinstance(input_size, Sequence), f'Expect ' \ f'input_size is ' \ f'`Sequence` ' \ f'but get {input_size}' H, W = input_size assert L == H * W, 'input feature has wrong size' x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W # Use nn.Unfold to merge patch. 
def swin_converter(ckpt):
    """Rename (and where needed re-order) the entries of a Swin checkpoint.

    Keys starting with ``head`` are dropped. Every other entry is written to
    the result under a ``backbone.`` prefix after these rewrites:

    - ``layers*`` keys: ``attn.`` -> ``attn.w_msa.``, ``mlp.fc1.`` ->
      ``ffn.layers.0.0.``, ``mlp.fc2.`` -> ``ffn.layers.1.``, any other
      ``mlp.`` -> ``ffn.``, and the first ``layers`` -> ``stages``.
      ``downsample`` entries keep their key, but ``reduction.`` / ``norm.``
      tensors get their four channel groups re-ordered as (0, 2, 1, 3) and
      interleaved.
    - ``patch_embed*`` keys: ``proj`` -> ``projection``.

    Args:
        ckpt (Mapping[str, Tensor]): Source state dict.

    Returns:
        OrderedDict: Converted state dict, in the iteration order of ``ckpt``.
    """
    group_order = [0, 2, 1, 3]
    mlp_renames = (
        ('mlp.fc1.', 'ffn.layers.0.0.'),
        ('mlp.fc2.', 'ffn.layers.1.'),
        ('mlp.', 'ffn.'),
    )

    def _reorder_reduction(weight):
        # (out, in) -> (out, 4, in/4) -> permute groups -> interleave
        n_out, n_in = weight.shape
        grouped = weight.reshape(n_out, 4, n_in // 4)[:, group_order, :]
        return grouped.transpose(1, 2).reshape(n_out, n_in)

    def _reorder_norm(param):
        # (in,) -> (4, in/4) -> permute groups -> interleave
        n_in = param.shape[0]
        grouped = param.reshape(4, n_in // 4)[group_order, :]
        return grouped.transpose(0, 1).reshape(n_in)

    def _convert_stage_entry(key, value):
        if 'attn.' in key:
            renamed = key.replace('attn.', 'attn.w_msa.')
        elif 'mlp.' in key:
            # First matching pattern wins; the last one always matches here.
            old, new = next(
                pair for pair in mlp_renames if pair[0] in key)
            renamed = key.replace(old, new)
        else:
            renamed = key
            if 'downsample' in key:
                if 'reduction.' in key:
                    value = _reorder_reduction(value)
                elif 'norm.' in key:
                    value = _reorder_norm(value)
        return renamed.replace('layers', 'stages', 1), value

    converted = OrderedDict()
    for key, value in ckpt.items():
        if key.startswith('head'):
            continue
        if key.startswith('layers'):
            out_key, out_value = _convert_stage_entry(key, value)
        elif key.startswith('patch_embed'):
            # ``replace`` is a no-op when 'proj' does not occur in the key.
            out_key, out_value = key.replace('proj', 'projection'), value
        else:
            out_key, out_value = key, value
        converted['backbone.' + out_key] = out_value
    return converted
Args: embed_dims (int): Number of input channels. num_heads (int): Number of attention heads. window_size (tuple[int]): The height and width of the window. qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. Default: True. qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. Default: None. attn_drop_rate (float, optional): Dropout ratio of attention weight. Default: 0.0 proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. init_cfg (dict | None, optional): The Config for initialization. Default: None. """ def __init__(self, embed_dims, num_heads, window_size, qkv_bias=True, qk_scale=None, attn_drop_rate=0., proj_drop_rate=0., init_cfg=None): super().__init__() self.embed_dims = embed_dims self.window_size = window_size # Wh, Ww self.num_heads = num_heads head_embed_dims = embed_dims // num_heads self.scale = qk_scale or head_embed_dims**-0.5 self.init_cfg = init_cfg # define a parameter table of relative position bias self.relative_position_bias_table = nn.Parameter( torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH # About 2x faster than original impl Wh, Ww = self.window_size rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww) rel_position_index = rel_index_coords + rel_index_coords.T rel_position_index = rel_position_index.flip(1).contiguous() self.register_buffer('relative_position_index', rel_position_index) self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop_rate) self.proj = nn.Linear(embed_dims, embed_dims) self.proj_drop = nn.Dropout(proj_drop_rate) self.softmax = nn.Softmax(dim=-1) def init_weights(self): trunc_normal_(self.relative_position_bias_table, std=0.02) def forward(self, x, mask=None): """ Args: x (tensor): input features with shape of (num_windows*B, N, C) mask (tensor | None, Optional): mask with shape of (num_windows, Wh*Ww, Wh*Ww), value should be between (-inf, 
class ShiftWindowMSA(BaseModule):
    """Window attention applied on an optionally cyclically shifted map.

    Args:
        embed_dims (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): The height and width of the window.
        shift_size (int, optional): The shift step of each window towards
            right-bottom. If zero, act as regular window-msa.
            Defaults to 0.
        qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
            Default: True
        qk_scale (float | None, optional): Override default qk scale of
            head_dim ** -0.5 if set. Defaults: None.
        attn_drop_rate (float, optional): Dropout ratio of attention weight.
            Defaults: 0.
        proj_drop_rate (float, optional): Dropout ratio of output.
            Defaults: 0.
        dropout_layer (dict, optional): The dropout_layer used before output.
            Defaults: dict(type='DropPath', drop_prob=0.).
        init_cfg (dict, optional): The extra config for initialization.
            Default: None.
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 window_size,
                 shift_size=0,
                 qkv_bias=True,
                 qk_scale=None,
                 attn_drop_rate=0,
                 proj_drop_rate=0,
                 dropout_layer=dict(type='DropPath', drop_prob=0.),
                 init_cfg=None):
        super().__init__(init_cfg)

        self.window_size = window_size
        self.shift_size = shift_size
        assert 0 <= self.shift_size < self.window_size

        msa_kwargs = dict(
            embed_dims=embed_dims,
            num_heads=num_heads,
            window_size=to_2tuple(window_size),
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop_rate=attn_drop_rate,
            proj_drop_rate=proj_drop_rate,
            init_cfg=None)
        self.w_msa = WindowMSA(**msa_kwargs)

        self.drop = build_dropout(dropout_layer)

    def _shift_attn_mask(self, padded_h, padded_w, device):
        """Build the additive attention mask used when the map is shifted.

        The padded map is split into 3 x 3 regions by the window and shift
        sizes; token pairs of one window that fall into different regions
        get -100.0, pairs from the same region get 0.0.
        """
        ws, shift = self.window_size, self.shift_size
        bands = (slice(0, -ws), slice(-ws, -shift), slice(-shift, None))

        region_map = torch.zeros((1, padded_h, padded_w, 1), device=device)
        for row, h_band in enumerate(bands):
            for col, w_band in enumerate(bands):
                region_map[:, h_band, w_band, :] = row * len(bands) + col

        # nW, window_size * window_size
        region_windows = self.window_partition(region_map).view(-1, ws * ws)
        pairwise = region_windows.unsqueeze(1) - region_windows.unsqueeze(2)
        return pairwise.masked_fill(pairwise != 0, float(-100.0)).masked_fill(
            pairwise == 0, float(0.0))

    def forward(self, query, hw_shape):
        """Run (shifted) window attention on a flattened feature map.

        Args:
            query (Tensor): Tokens of shape (B, H * W, C).
            hw_shape (tuple[int]): Spatial shape (H, W) of ``query``.

        Returns:
            Tensor: Tokens of shape (B, H * W, C).
        """
        batch, num_tokens, channels = query.shape
        height, width = hw_shape
        assert num_tokens == height * width, 'input feature has wrong size'
        ws, shift = self.window_size, self.shift_size

        # Pad right/bottom so both spatial sizes are multiples of the window.
        pad_right = (ws - width % ws) % ws
        pad_bottom = (ws - height % ws) % ws
        feat = F.pad(
            query.view(batch, height, width, channels),
            (0, 0, 0, pad_right, 0, pad_bottom))
        padded_h, padded_w = feat.shape[1], feat.shape[2]

        shifted = shift > 0
        if shifted:
            attn_mask = self._shift_attn_mask(padded_h, padded_w, feat.device)
            # cyclic shift towards top-left
            feat = torch.roll(feat, shifts=(-shift, -shift), dims=(1, 2))
        else:
            attn_mask = None

        # nW*B, window_size*window_size, C
        windows = self.window_partition(feat).view(-1, ws**2, channels)
        attended = self.w_msa(windows, mask=attn_mask)

        # Stitch the windows back into a (B, H', W', C) map.
        attended = attended.view(-1, ws, ws, channels)
        merged = self.window_reverse(attended, padded_h, padded_w)
        if shifted:
            # undo the cyclic shift
            merged = torch.roll(merged, shifts=(shift, shift), dims=(1, 2))

        if pad_right > 0 or pad_bottom:
            merged = merged[:, :height, :width, :].contiguous()

        return self.drop(merged.view(batch, height * width, channels))

    def window_reverse(self, windows, H, W):
        """
        Args:
            windows: (num_windows*B, window_size, window_size, C)
            H (int): Height of image
            W (int): Width of image
        Returns:
            x: (B, H, W, C)
        """
        ws = self.window_size
        batch = int(windows.shape[0] / (H * W / ws / ws))
        grid = windows.view(batch, H // ws, W // ws, ws, ws, -1)
        # (B, nH, nW, ws, ws, C) -> (B, nH, ws, nW, ws, C) -> (B, H, W, C)
        return grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(
            batch, H, W, -1)

    def window_partition(self, x):
        """
        Args:
            x: (B, H, W, C)
        Returns:
            windows: (num_windows*B, window_size, window_size, C)
        """
        batch, height, width, channels = x.shape
        ws = self.window_size
        grid = x.view(batch, height // ws, ws, width // ws, ws, channels)
        # (B, nH, ws, nW, ws, C) -> (B, nH, nW, ws, ws, C)
        grid = grid.permute(0, 1, 3, 2, 4, 5).contiguous()
        return grid.view(-1, ws, ws, channels)
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. Default: None. drop_rate (float, optional): Dropout rate. Default: 0. attn_drop_rate (float, optional): Attention dropout rate. Default: 0. drop_path_rate (float, optional): Stochastic depth rate. Default: 0. act_cfg (dict, optional): The config dict of activation function. Default: dict(type='GELU'). norm_cfg (dict, optional): The config dict of normalization. Default: dict(type='LN'). with_cp (bool, optional): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. init_cfg (dict | list | None, optional): The init config. Default: None. """ def __init__(self, embed_dims, num_heads, feedforward_channels, window_size=7, shift=False, qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., act_cfg=dict(type='GELU'), norm_cfg=dict(type='LN'), with_cp=False, init_cfg=None): super(SwinBlock, self).__init__() self.init_cfg = init_cfg self.with_cp = with_cp self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] self.attn = ShiftWindowMSA( embed_dims=embed_dims, num_heads=num_heads, window_size=window_size, shift_size=window_size // 2 if shift else 0, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop_rate=attn_drop_rate, proj_drop_rate=drop_rate, dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), init_cfg=None) self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1] self.ffn = FFN( embed_dims=embed_dims, feedforward_channels=feedforward_channels, num_fcs=2, ffn_drop=drop_rate, dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), act_cfg=act_cfg, add_identity=True, init_cfg=None) def forward(self, x, hw_shape): def _inner_forward(x): identity = x x = self.norm1(x) x = self.attn(x, hw_shape) x = x + identity identity = x x = self.norm2(x) x = self.ffn(x, identity=identity) return x if self.with_cp and x.requires_grad: x = cp.checkpoint(_inner_forward, x) else: x = _inner_forward(x) 
return x class SwinBlockSequence(BaseModule): """Implements one stage in Swin Transformer. Args: embed_dims (int): The feature dimension. num_heads (int): Parallel attention heads. feedforward_channels (int): The hidden dimension for FFNs. depth (int): The number of blocks in this stage. window_size (int, optional): The local window scale. Default: 7. qkv_bias (bool, optional): enable bias for qkv if True. Default: True. qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. Default: None. drop_rate (float, optional): Dropout rate. Default: 0. attn_drop_rate (float, optional): Attention dropout rate. Default: 0. drop_path_rate (float | list[float], optional): Stochastic depth rate. Default: 0. downsample (BaseModule | None, optional): The downsample operation module. Default: None. act_cfg (dict, optional): The config dict of activation function. Default: dict(type='GELU'). norm_cfg (dict, optional): The config dict of normalization. Default: dict(type='LN'). with_cp (bool, optional): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. init_cfg (dict | list | None, optional): The init config. Default: None. 
@BACKBONES.register_module()
class SwinTransformer(BaseModule):
    """ Swin Transformer
    A PyTorch implement of : `Swin Transformer:
    Hierarchical Vision Transformer using Shifted Windows`  -
        https://arxiv.org/abs/2103.14030

    Inspiration from
    https://github.com/microsoft/Swin-Transformer

    Args:
        pretrain_img_size (int | tuple[int]): The size of input image when
            pretrain. Defaults: 224.
        in_channels (int): The num of input channels.
            Defaults: 3.
        embed_dims (int): The feature dimension. Default: 96.
        patch_size (int | tuple[int]): Patch size. Default: 4.
        window_size (int): Window size. Default: 7.
        mlp_ratio (int): Ratio of mlp hidden dim to embedding dim.
            Default: 4.
        depths (tuple[int]): Depths of each Swin Transformer stage.
            Default: (2, 2, 6, 2).
        num_heads (tuple[int]): Parallel attention heads of each Swin
            Transformer stage. Default: (3, 6, 12, 24).
        strides (tuple[int]): The patch merging or patch embedding stride of
            each Swin Transformer stage. (In swin, we set kernel size equal to
            stride.) Default: (4, 2, 2, 2).
        out_indices (tuple[int]): Output from which stages.
            Default: (0, 1, 2, 3).
        qkv_bias (bool, optional): If True, add a learnable bias to query, key,
            value. Default: True
        qk_scale (float | None, optional): Override default qk scale of
            head_dim ** -0.5 if set. Default: None.
        patch_norm (bool): If add a norm layer for patch embed and patch
            merging. Default: True.
        drop_rate (float): Dropout rate. Defaults: 0.
        attn_drop_rate (float): Attention dropout rate. Default: 0.
        drop_path_rate (float): Stochastic depth rate. Defaults: 0.1.
        use_abs_pos_embed (bool): If True, add absolute position embedding to
            the patch embedding. Defaults: False.
        act_cfg (dict): Config dict for activation layer.
            Default: dict(type='GELU').
        norm_cfg (dict): Config dict for normalization layer at
            output of backbone. Defaults: dict(type='LN').
        with_cp (bool, optional): Use checkpoint or not. Using checkpoint
            will save some memory while slowing down the training speed.
            Default: False.
        pretrained (str, optional): model pretrained path. Default: None.
        convert_weights (bool): The flag indicates whether the
            pre-trained model is from the original repo. We may need
            to convert some keys to make it compatible.
            Default: False.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            Default: -1 (-1 means not freezing any parameters).
        init_cfg (dict, optional): The Config for initialization.
            Defaults to None.
    """

    def __init__(self,
                 pretrain_img_size=224,
                 in_channels=3,
                 embed_dims=96,
                 patch_size=4,
                 window_size=7,
                 mlp_ratio=4,
                 depths=(2, 2, 6, 2),
                 num_heads=(3, 6, 12, 24),
                 strides=(4, 2, 2, 2),
                 out_indices=(0, 1, 2, 3),
                 qkv_bias=True,
                 qk_scale=None,
                 patch_norm=True,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.1,
                 use_abs_pos_embed=False,
                 act_cfg=dict(type='GELU'),
                 norm_cfg=dict(type='LN'),
                 with_cp=False,
                 pretrained=None,
                 convert_weights=False,
                 frozen_stages=-1,
                 init_cfg=None):
        self.convert_weights = convert_weights
        self.frozen_stages = frozen_stages
        if isinstance(pretrain_img_size, int):
            pretrain_img_size = to_2tuple(pretrain_img_size)
        elif isinstance(pretrain_img_size, tuple):
            if len(pretrain_img_size) == 1:
                pretrain_img_size = to_2tuple(pretrain_img_size[0])
            assert len(pretrain_img_size) == 2, \
                f'The size of image should have length 1 or 2, ' \
                f'but got {len(pretrain_img_size)}'

        assert not (init_cfg and pretrained), \
            'init_cfg and pretrained cannot be specified at the same time'
        if isinstance(pretrained, str):
            warnings.warn('DeprecationWarning: pretrained is deprecated, '
                          'please use "init_cfg" instead')
            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
        elif pretrained is None:
            self.init_cfg = init_cfg
        else:
            raise TypeError('pretrained must be a str or None')

        # NOTE(review): the raw ``init_cfg`` (not the ``Pretrained`` dict
        # built above) is what gets passed to the base class; presumably the
        # base class stores it and replaces ``self.init_cfg`` - confirm.
        super(SwinTransformer, self).__init__(init_cfg=init_cfg)

        num_layers = len(depths)
        self.out_indices = out_indices
        self.use_abs_pos_embed = use_abs_pos_embed

        assert strides[0] == patch_size, 'Use non-overlapping patch embed.'

        self.patch_embed = PatchEmbed(
            in_channels=in_channels,
            embed_dims=embed_dims,
            conv_type='Conv2d',
            kernel_size=patch_size,
            stride=strides[0],
            norm_cfg=norm_cfg if patch_norm else None,
            init_cfg=None)

        if self.use_abs_pos_embed:
            patch_row = pretrain_img_size[0] // patch_size
            patch_col = pretrain_img_size[1] // patch_size
            # Stored channel-first: (1, C, rows, cols).
            self.absolute_pos_embed = nn.Parameter(
                torch.zeros((1, embed_dims, patch_row, patch_col)))

        self.drop_after_pos = nn.Dropout(p=drop_rate)

        # set stochastic depth decay rule
        total_depth = sum(depths)
        dpr = [
            x.item() for x in torch.linspace(0, drop_path_rate, total_depth)
        ]

        self.stages = ModuleList()
        in_channels = embed_dims
        for i in range(num_layers):
            # Every stage but the last ends with a patch-merging layer that
            # doubles the channel count.
            if i < num_layers - 1:
                downsample = PatchMerging(
                    in_channels=in_channels,
                    out_channels=2 * in_channels,
                    stride=strides[i + 1],
                    norm_cfg=norm_cfg if patch_norm else None,
                    init_cfg=None)
            else:
                downsample = None

            stage = SwinBlockSequence(
                embed_dims=in_channels,
                num_heads=num_heads[i],
                feedforward_channels=mlp_ratio * in_channels,
                depth=depths[i],
                window_size=window_size,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop_rate=drop_rate,
                attn_drop_rate=attn_drop_rate,
                drop_path_rate=dpr[sum(depths[:i]):sum(depths[:i + 1])],
                downsample=downsample,
                act_cfg=act_cfg,
                norm_cfg=norm_cfg,
                with_cp=with_cp,
                init_cfg=None)
            self.stages.append(stage)
            if downsample:
                in_channels = downsample.out_channels

        self.num_features = [int(embed_dims * 2**i) for i in range(num_layers)]
        # Add a norm layer for each output
        for i in out_indices:
            layer = build_norm_layer(norm_cfg, self.num_features[i])[1]
            layer_name = f'norm{i}'
            self.add_module(layer_name, layer)

    def train(self, mode=True):
        """Convert the model into training mode while keep layers freezed."""
        super(SwinTransformer, self).train(mode)
        self._freeze_stages()

    def _freeze_stages(self):
        """Put the first ``frozen_stages`` stages (and the patch embedding)
        into eval mode and stop their gradients."""
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.requires_grad = False
            if self.use_abs_pos_embed:
                self.absolute_pos_embed.requires_grad = False
            self.drop_after_pos.eval()

        for i in range(1, self.frozen_stages + 1):

            if (i - 1) in self.out_indices:
                norm_layer = getattr(self, f'norm{i-1}')
                norm_layer.eval()
                for param in norm_layer.parameters():
                    param.requires_grad = False

            m = self.stages[i - 1]
            m.eval()
            for param in m.parameters():
                param.requires_grad = False

    def init_weights(self, pretrained=None):
        """Initialise the backbone, optionally from a checkpoint.

        Args:
            pretrained (str, optional): Checkpoint location handed to
                ``_load_checkpoint``. When ``None`` the absolute position
                embedding, ``nn.Linear`` and ``nn.LayerNorm`` layers are
                initialised from scratch. Default: None.
        """
        logger = get_root_logger()
        if pretrained is None:
            logger.warning(f'No pre-trained weights for '
                           f'{self.__class__.__name__}, '
                           f'training start from scratch')
            if self.use_abs_pos_embed:
                trunc_normal_(self.absolute_pos_embed, std=0.02)
            for m in self.modules():
                if isinstance(m, nn.Linear):
                    trunc_normal_init(m, std=.02, bias=0.)
                elif isinstance(m, nn.LayerNorm):
                    constant_init(m, 1.0)
            return

        ckpt = _load_checkpoint(pretrained, logger=logger, map_location='cpu')
        if 'state_dict' in ckpt:
            _state_dict = ckpt['state_dict']
        elif 'model' in ckpt:
            _state_dict = ckpt['model']
        else:
            _state_dict = ckpt
        if self.convert_weights:
            # supported loading weight from original repo,
            _state_dict = swin_converter(_state_dict)

        # Only entries under the ``backbone.`` prefix are kept.
        state_dict = OrderedDict()
        for k, v in _state_dict.items():
            if k.startswith('backbone.'):
                state_dict[k[9:]] = v

        if not state_dict:
            # Nothing to post-process; indexing the first key would raise.
            logger.warning(
                f'No key with prefix "backbone." found in {pretrained}, '
                f'no weights are loaded into {self.__class__.__name__}')

        # strip prefix of state_dict
        if state_dict and next(iter(state_dict)).startswith('module.'):
            state_dict = {k[7:]: v for k, v in state_dict.items()}

        # reshape absolute position embedding (only meaningful when this
        # model actually owns such a parameter)
        if self.use_abs_pos_embed and \
                state_dict.get('absolute_pos_embed') is not None:
            absolute_pos_embed = state_dict['absolute_pos_embed']
            N1, L, C1 = absolute_pos_embed.size()
            N2, C2, H, W = self.absolute_pos_embed.size()
            if N1 != N2 or C1 != C2 or L != H * W:
                logger.warning('Error in loading absolute_pos_embed, pass')
            else:
                state_dict['absolute_pos_embed'] = absolute_pos_embed.view(
                    N2, H, W, C2).permute(0, 3, 1, 2).contiguous()

        # interpolate position bias table if needed
        relative_position_bias_table_keys = [
            k for k in state_dict.keys()
            if 'relative_position_bias_table' in k
        ]
        # Snapshot once instead of rebuilding the state dict per table.
        current_state = self.state_dict()
        for table_key in relative_position_bias_table_keys:
            if table_key not in current_state:
                logger.warning(f'Error in loading {table_key}, pass')
                continue
            table_pretrained = state_dict[table_key]
            table_current = current_state[table_key]
            L1, nH1 = table_pretrained.size()
            L2, nH2 = table_current.size()
            if nH1 != nH2:
                logger.warning(f'Error in loading {table_key}, pass')
            elif L1 != L2:
                # Tables are flattened square grids; resize grid to grid.
                S1 = int(L1**0.5)
                S2 = int(L2**0.5)
                table_pretrained_resized = F.interpolate(
                    table_pretrained.permute(1, 0).reshape(1, nH1, S1, S1),
                    size=(S2, S2),
                    mode='bicubic')
                state_dict[table_key] = table_pretrained_resized.view(
                    nH2, L2).permute(1, 0).contiguous()

        # load state_dict (non-strict)
        self.load_state_dict(state_dict, False)

    def forward(self, x):
        """Run the backbone.

        Args:
            x (Tensor): Input passed to the patch embedding.

        Returns:
            list[Tensor]: One normalised, channel-first feature map per
            index in ``out_indices``.
        """
        x, hw_shape = self.patch_embed(x)

        if self.use_abs_pos_embed:
            # absolute_pos_embed is (1, C, rows, cols): spatial dims are 2, 3.
            h, w = self.absolute_pos_embed.shape[2:4]
            if hw_shape[0] != h or hw_shape[1] != w:
                absolute_pos_embed = F.interpolate(
                    self.absolute_pos_embed,
                    size=hw_shape,
                    mode='bicubic',
                    align_corners=False).flatten(2).transpose(1, 2)
            else:
                absolute_pos_embed = self.absolute_pos_embed.flatten(
                    2).transpose(1, 2)
            x = x + absolute_pos_embed
        x = self.drop_after_pos(x)

        outs = []
        for i, stage in enumerate(self.stages):
            x, hw_shape, out, out_hw_shape = stage(x, hw_shape)
            if i in self.out_indices:
                norm_layer = getattr(self, f'norm{i}')
                out = norm_layer(out)
                # (B, H * W, C) -> (B, C, H, W)
                out = out.view(-1, *out_hw_shape,
                               self.num_features[i]).permute(0, 3, 1,
                                                             2).contiguous()
                outs.append(out)

        return outs
def build_detector(cfg, train_cfg=None, test_cfg=None):
    """Build a detector from ``cfg`` through the ``DETECTORS`` registry.

    Args:
        cfg: Detector config; must provide ``get``.
        train_cfg: Outer-level train config. Passing a non-``None`` value
            emits a ``UserWarning`` and is rejected when ``cfg`` already
            carries a ``train_cfg``. Default: None.
        test_cfg: Same as ``train_cfg`` for the test config. Default: None.

    Returns:
        Whatever ``build`` returns for ``cfg`` and ``DETECTORS``; both outer
        configs are passed along as the third argument.
    """
    outer_given = not (train_cfg is None and test_cfg is None)
    if outer_given:
        warnings.warn(
            'train_cfg and test_cfg is deprecated, '
            'please specify them in model', UserWarning)

    # Each config may live in the model dict or be passed here, never both.
    train_in_model = cfg.get('train_cfg')
    assert train_in_model is None or train_cfg is None, \
        'train_cfg specified in both outer field and model field '
    test_in_model = cfg.get('test_cfg')
    assert test_in_model is None or test_cfg is None, \
        'test_cfg specified in both outer field and model field '

    outer_cfgs = dict(train_cfg=train_cfg, test_cfg=test_cfg)
    return build(cfg, DETECTORS, outer_cfgs)
import numpy as np import torch from mmcv.cnn import bias_init_with_prob, normal_init from mmcv.runner import force_fp32 from torch import nn as nn from mmdet3d.core import (PseudoSampler, box3d_multiclass_nms, limit_period, xywhr2xyxyr) from mmdet.core import (build_anchor_generator, build_assigner, build_bbox_coder, build_sampler, multi_apply) from mmdet.models import HEADS from ..builder import build_loss from .train_mixins import AnchorTrainMixin @HEADS.register_module() class Anchor3DHead(nn.Module, AnchorTrainMixin): """Anchor head for SECOND/PointPillars/MVXNet/PartA2. Args: num_classes (int): Number of classes. in_channels (int): Number of channels in the input feature map. train_cfg (dict): Train configs. test_cfg (dict): Test configs. feat_channels (int): Number of channels of the feature map. use_direction_classifier (bool): Whether to add a direction classifier. anchor_generator(dict): Config dict of anchor generator. assigner_per_size (bool): Whether to do assignment for each separate anchor size. assign_per_class (bool): Whether to do assignment for each class. diff_rad_by_sin (bool): Whether to change the difference into sin difference for box regression loss. dir_offset (float | int): The offset of BEV rotation angles. (TODO: may be moved into box coder) dir_limit_offset (float | int): The limited range of BEV rotation angles. (TODO: may be moved into box coder) bbox_coder (dict): Config dict of box coders. loss_cls (dict): Config of classification loss. loss_bbox (dict): Config of localization loss. loss_dir (dict): Config of direction classifier loss. 
def __init__(self,
             num_classes,
             in_channels,
             train_cfg,
             test_cfg,
             feat_channels=256,
             use_direction_classifier=True,
             anchor_generator=dict(
                 type='Anchor3DRangeGenerator',
                 range=[0, -39.68, -1.78, 69.12, 39.68, -1.78],
                 strides=[2],
                 sizes=[[1.6, 3.9, 1.56]],
                 rotations=[0, 1.57],
                 custom_values=[],
                 reshape_out=False),
             assigner_per_size=False,
             assign_per_class=False,
             diff_rad_by_sin=True,
             dir_offset=0,
             dir_limit_offset=1,
             bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
             loss_cls=dict(
                 type='CrossEntropyLoss',
                 use_sigmoid=True,
                 loss_weight=1.0),
             loss_bbox=dict(
                 type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
             loss_dir=dict(type='CrossEntropyLoss', loss_weight=0.2)):
    """Store the configuration and build generator, coder, losses, layers.

    The dict defaults are only read here, never modified.
    """
    super().__init__()
    self.in_channels = in_channels
    self.num_classes = num_classes
    self.feat_channels = feat_channels
    self.diff_rad_by_sin = diff_rad_by_sin
    self.use_direction_classifier = use_direction_classifier
    self.train_cfg = train_cfg
    self.test_cfg = test_cfg
    self.assigner_per_size = assigner_per_size
    self.assign_per_class = assign_per_class
    self.dir_offset = dir_offset
    self.dir_limit_offset = dir_limit_offset
    # Set exactly once (the flag used to be assigned twice in this method).
    self.fp16_enabled = False

    # build anchor generator
    self.anchor_generator = build_anchor_generator(anchor_generator)
    # In 3D detection, the anchor stride is connected with anchor size
    self.num_anchors = self.anchor_generator.num_base_anchors
    # build box coder
    self.bbox_coder = build_bbox_coder(bbox_coder)
    self.box_code_size = self.bbox_coder.code_size

    # build loss function
    self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
    self.sampling = loss_cls['type'] not in ['FocalLoss', 'GHMC']
    if not self.use_sigmoid_cls:
        # Without sigmoid scoring one extra output channel is reserved.
        self.num_classes += 1
    self.loss_cls = build_loss(loss_cls)
    self.loss_bbox = build_loss(loss_bbox)
    self.loss_dir = build_loss(loss_dir)

    self._init_layers()
    self._init_assigner_sampler()
def forward_single(self, x):
    """Apply the prediction convs to one feature map.

    Args:
        x (torch.Tensor): Input features.

    Returns:
        tuple: ``(cls_score, bbox_pred, dir_cls_preds)``. The last entry
        is ``None`` when ``self.use_direction_classifier`` is false.
    """
    scores = self.conv_cls(x)
    regression = self.conv_reg(x)
    if self.use_direction_classifier:
        direction = self.conv_dir_cls(x)
    else:
        direction = None
    return scores, regression, direction
def loss_single(self, cls_score, bbox_pred, dir_cls_preds, labels,
                label_weights, bbox_targets, bbox_weights, dir_targets,
                dir_weights, num_total_samples):
    """Calculate loss of Single-level results.

    Args:
        cls_score (torch.Tensor): Class score in single-level.
        bbox_pred (torch.Tensor): Bbox prediction in single-level.
        dir_cls_preds (torch.Tensor): Predictions of direction class
            in single-level. Only read when the direction classifier is
            enabled.
        labels (torch.Tensor): Labels of class.
        label_weights (torch.Tensor): Weights of class loss.
        bbox_targets (torch.Tensor): Targets of bbox predictions.
        bbox_weights (torch.Tensor): Weights of bbox loss.
        dir_targets (torch.Tensor): Targets of direction predictions.
        dir_weights (torch.Tensor): Weights of direction loss.
        num_total_samples (int | None): The number of valid samples.
            ``None`` falls back to ``cls_score.shape[0]``.

    Returns:
        tuple[torch.Tensor]: Losses of class, bbox and direction,
        respectively. The direction loss is ``None`` when the direction
        classifier is disabled.
    """
    # classification loss
    if num_total_samples is None:
        num_total_samples = int(cls_score.shape[0])
    labels = labels.reshape(-1)
    label_weights = label_weights.reshape(-1)
    cls_score = cls_score.permute(0, 2, 3, 1).reshape(-1, self.num_classes)
    assert labels.max().item() <= self.num_classes
    loss_cls = self.loss_cls(
        cls_score, labels, label_weights, avg_factor=num_total_samples)

    # regression loss
    bbox_pred = bbox_pred.permute(0, 2, 3,
                                  1).reshape(-1, self.box_code_size)
    bbox_targets = bbox_targets.reshape(-1, self.box_code_size)
    bbox_weights = bbox_weights.reshape(-1, self.box_code_size)

    # Labels in [0, num_classes) are foreground.
    bg_class_ind = self.num_classes
    pos_inds = ((labels >= 0)
                & (labels < bg_class_ind)).nonzero(
                    as_tuple=False).reshape(-1)
    num_pos = len(pos_inds)

    pos_bbox_pred = bbox_pred[pos_inds]
    pos_bbox_targets = bbox_targets[pos_inds]
    pos_bbox_weights = bbox_weights[pos_inds]

    # dir loss
    if self.use_direction_classifier:
        dir_cls_preds = dir_cls_preds.permute(0, 2, 3, 1).reshape(-1, 2)
        dir_targets = dir_targets.reshape(-1)
        dir_weights = dir_weights.reshape(-1)
        pos_dir_cls_preds = dir_cls_preds[pos_inds]
        pos_dir_targets = dir_targets[pos_inds]
        pos_dir_weights = dir_weights[pos_inds]

    # Bound on every path: it used to be assigned only inside the
    # ``num_pos > 0`` branch, so a level without positives and without a
    # direction classifier raised UnboundLocalError at the return.
    loss_dir = None
    if num_pos > 0:
        code_weight = self.train_cfg.get('code_weight', None)
        if code_weight:
            pos_bbox_weights = pos_bbox_weights * bbox_weights.new_tensor(
                code_weight)
        if self.diff_rad_by_sin:
            pos_bbox_pred, pos_bbox_targets = self.add_sin_difference(
                pos_bbox_pred, pos_bbox_targets)
        loss_bbox = self.loss_bbox(
            pos_bbox_pred,
            pos_bbox_targets,
            pos_bbox_weights,
            avg_factor=num_total_samples)

        # direction classification loss
        if self.use_direction_classifier:
            loss_dir = self.loss_dir(
                pos_dir_cls_preds,
                pos_dir_targets,
                pos_dir_weights,
                avg_factor=num_total_samples)
    else:
        # No positives: sums over empty selections give zero-valued losses
        # that are still derived from the predictions.
        loss_bbox = pos_bbox_pred.sum()
        if self.use_direction_classifier:
            loss_dir = pos_dir_cls_preds.sum()

    return loss_cls, loss_bbox, loss_dir
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds'))
def loss(self,
         cls_scores,
         bbox_preds,
         dir_cls_preds,
         gt_bboxes,
         gt_labels,
         input_metas,
         gt_bboxes_ignore=None):
    """Compute the per-level classification, box and direction losses.

    Args:
        cls_scores (list[torch.Tensor]): Multi-level class scores.
        bbox_preds (list[torch.Tensor]): Multi-level bbox predictions.
        dir_cls_preds (list[torch.Tensor]): Multi-level direction
            class predictions.
        gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Gt bboxes
            of each sample.
        gt_labels (list[torch.Tensor]): Gt labels of each sample.
        input_metas (list[dict]): Contain pcd and img's meta info.
        gt_bboxes_ignore (None | list[torch.Tensor]): Forwarded to
            ``anchor_target_3d`` as ``gt_bboxes_ignore_list``.

    Returns:
        dict | None: ``loss_cls``, ``loss_bbox`` and ``loss_dir``, each
        holding the per-level results of ``loss_single``. ``None`` when
        ``anchor_target_3d`` returns ``None``.
    """
    level_sizes = [level.size()[-2:] for level in cls_scores]
    assert len(level_sizes) == self.anchor_generator.num_levels

    anchors_per_sample = self.get_anchors(
        level_sizes, input_metas, device=cls_scores[0].device)

    if self.use_sigmoid_cls:
        label_channels = self.cls_out_channels
    else:
        label_channels = 1

    targets = self.anchor_target_3d(
        anchors_per_sample,
        gt_bboxes,
        input_metas,
        gt_bboxes_ignore_list=gt_bboxes_ignore,
        gt_labels_list=gt_labels,
        num_classes=self.num_classes,
        label_channels=label_channels,
        sampling=self.sampling)
    if targets is None:
        return None

    (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
     dir_targets_list, dir_weights_list, num_total_pos,
     num_total_neg) = targets

    # With a real sampler the normaliser counts positives and negatives;
    # otherwise only the positives.
    num_total_samples = num_total_pos
    if self.sampling:
        num_total_samples = num_total_pos + num_total_neg

    per_level = multi_apply(
        self.loss_single,
        cls_scores,
        bbox_preds,
        dir_cls_preds,
        labels_list,
        label_weights_list,
        bbox_targets_list,
        bbox_weights_list,
        dir_targets_list,
        dir_weights_list,
        num_total_samples=num_total_samples)
    losses_cls, losses_bbox, losses_dir = per_level
    return {
        'loss_cls': losses_cls,
        'loss_bbox': losses_bbox,
        'loss_dir': losses_dir,
    }
def get_bboxes_single(self,
                      cls_scores,
                      bbox_preds,
                      dir_cls_preds,
                      mlvl_anchors,
                      input_meta,
                      cfg=None,
                      rescale=False):
    """Decode, filter and NMS the predictions of a single sample.

    Args:
        cls_scores (list[torch.Tensor]): Per-level class scores of one
            sample; each is permuted with ``(1, 2, 0)`` before use.
        bbox_preds (list[torch.Tensor]): Per-level bbox predictions of
            one sample.
        dir_cls_preds (list[torch.Tensor]): Per-level direction class
            predictions of one sample.
        mlvl_anchors (list[torch.Tensor]): Per-level anchors, already
            reshaped to ``(-1, box_code_size)``.
        input_meta (dict): Meta info of the sample; ``'box_type_3d'`` is
            used to construct the box objects.
        cfg (None | :obj:`ConfigDict`): Test config; ``self.test_cfg`` is
            used when ``None``.
        rescale (bool): Accepted for interface compatibility; not read in
            this method.

    Returns:
        tuple: Contain predictions of single sample.

            - bboxes (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes.
            - scores (torch.Tensor): Class score of each bbox.
            - labels (torch.Tensor): Label of each bbox.
    """
    cfg = self.test_cfg if cfg is None else cfg
    assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors)
    mlvl_bboxes = []
    mlvl_scores = []
    mlvl_dir_scores = []
    for cls_score, bbox_pred, dir_cls_pred, anchors in zip(
            cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors):
        assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
        assert cls_score.size()[-2:] == dir_cls_pred.size()[-2:]
        dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2)
        # Index (0 or 1) of the stronger direction bin per anchor.
        dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1]

        cls_score = cls_score.permute(1, 2,
                                      0).reshape(-1, self.num_classes)
        if self.use_sigmoid_cls:
            scores = cls_score.sigmoid()
        else:
            scores = cls_score.softmax(-1)
        bbox_pred = bbox_pred.permute(1, 2,
                                      0).reshape(-1, self.box_code_size)

        # Keep only the ``nms_pre`` best-scoring anchors of this level.
        nms_pre = cfg.get('nms_pre', -1)
        if nms_pre > 0 and scores.shape[0] > nms_pre:
            if self.use_sigmoid_cls:
                max_scores, _ = scores.max(dim=1)
            else:
                # The last softmax column is left out of the ranking.
                max_scores, _ = scores[:, :-1].max(dim=1)
            _, topk_inds = max_scores.topk(nms_pre)
            anchors = anchors[topk_inds, :]
            bbox_pred = bbox_pred[topk_inds, :]
            scores = scores[topk_inds, :]
            dir_cls_score = dir_cls_score[topk_inds]

        bboxes = self.bbox_coder.decode(anchors, bbox_pred)
        mlvl_bboxes.append(bboxes)
        mlvl_scores.append(scores)
        mlvl_dir_scores.append(dir_cls_score)

    mlvl_bboxes = torch.cat(mlvl_bboxes)
    mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d'](
        mlvl_bboxes, box_dim=self.box_code_size).bev)
    mlvl_scores = torch.cat(mlvl_scores)
    mlvl_dir_scores = torch.cat(mlvl_dir_scores)

    if self.use_sigmoid_cls:
        # Append an all-zero column as the LAST class when using sigmoid,
        # so the score matrix has one extra trailing column.
        padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
        mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)

    score_thr = cfg.get('score_thr', 0)
    results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms,
                                   mlvl_scores, score_thr, cfg.max_num,
                                   cfg, mlvl_dir_scores)
    bboxes, scores, labels, dir_scores = results
    if bboxes.shape[0] > 0:
        # Wrap the yaw with ``limit_period`` (period pi), then add pi for
        # boxes whose direction bin is 1.
        dir_rot = limit_period(bboxes[..., 6] - self.dir_offset,
                               self.dir_limit_offset, np.pi)
        bboxes[..., 6] = (
            dir_rot + self.dir_offset +
            np.pi * dir_scores.to(bboxes.dtype))
    bboxes = input_meta['box_type_3d'](bboxes, box_dim=self.box_code_size)
    return bboxes, scores, labels
def forward(self, feats):
    """Run the optional shared convs, then the cls and reg branches.

    Args:
        feats (Tensor): Input features.

    Returns:
        tuple: ``(cls_score, bbox_pred)``, the outputs of ``conv_cls``
        and ``conv_reg``.
    """
    # Start from the raw input so both branches have something to consume
    # when no shared convs are configured (``x`` used to be unbound then).
    x = feats
    if len(self.shared_conv_channels) > 0:
        x = self.shared_convs(x)

    # separate branches
    x_cls = x
    x_reg = x

    if len(self.cls_conv_channels) > 0:
        x_cls = self.cls_convs(x_cls)
    cls_score = self.conv_cls(x_cls)

    if len(self.reg_conv_channels) > 0:
        x_reg = self.reg_convs(x_reg)
    bbox_pred = self.conv_reg(x_reg)

    return cls_score, bbox_pred
@HEADS.register_module()
class SeparateHead(nn.Module):
    """SeparateHead for CenterHead.

    One small conv stack is built per entry of ``heads`` and registered as
    a sub-module under the entry's name.

    Args:
        in_channels (int): Input channels for conv_layer.
        heads (dict): Maps a head name to ``(out_channels, num_conv)``.
        head_conv (int): Channels of the intermediate conv layers.
            Default: 64.
        final_kernel (int): Kernel size used by every conv layer of a
            head. Default: 1.
        init_bias (float): Initial bias of the last ``'heatmap'`` layer.
            Default: -2.19.
        conv_cfg (dict): Config of conv layer.
            Default: dict(type='Conv2d')
        norm_cfg (dict): Config of norm layer.
            Default: dict(type='BN2d').
        bias (str): Type of bias of the intermediate layers.
            Default: 'auto'.
    """

    def __init__(self,
                 in_channels,
                 heads,
                 head_conv=64,
                 final_kernel=1,
                 init_bias=-2.19,
                 conv_cfg=dict(type='Conv2d'),
                 norm_cfg=dict(type='BN2d'),
                 bias='auto',
                 **kwargs):
        super(SeparateHead, self).__init__()
        self.heads = heads
        self.init_bias = init_bias
        for head in self.heads:
            classes, num_conv = self.heads[head]

            conv_layers = []
            c_in = in_channels
            for i in range(num_conv - 1):
                conv_layers.append(
                    ConvModule(
                        c_in,
                        head_conv,
                        kernel_size=final_kernel,
                        stride=1,
                        padding=final_kernel // 2,
                        bias=bias,
                        conv_cfg=conv_cfg,
                        norm_cfg=norm_cfg))
                c_in = head_conv

            # Feed the last layer with the width actually produced so far.
            # With ``num_conv == 1`` there is no intermediate layer, so the
            # input is still ``in_channels`` wide, not ``head_conv``.
            conv_layers.append(
                build_conv_layer(
                    conv_cfg,
                    c_in,
                    classes,
                    kernel_size=final_kernel,
                    stride=1,
                    padding=final_kernel // 2,
                    bias=True))
            conv_layers = nn.Sequential(*conv_layers)

            setattr(self, head, conv_layers)

    def init_weights(self):
        """Initialize weights."""
        for head in self.heads:
            if head == 'heatmap':
                # Only the bias of the last heatmap layer is set here.
                getattr(self, head)[-1].bias.data.fill_(self.init_bias)
            else:
                for m in getattr(self, head).modules():
                    if isinstance(m, nn.Conv2d):
                        kaiming_init(m)

    def forward(self, x):
        """Forward function for SepHead.

        Args:
            x (torch.Tensor): Input feature map.

        Returns:
            dict[str, torch.Tensor]: Output of every head, keyed by the
            names given in ``heads``.
        """
        ret_dict = dict()
        for head in self.heads:
            ret_dict[head] = getattr(self, head)(x)

        return ret_dict
def __init__(self,
             in_channels,
             num_cls,
             heads,
             dcn_config,
             head_conv=64,
             final_kernel=1,
             init_bias=-2.19,
             conv_cfg=dict(type='Conv2d'),
             norm_cfg=dict(type='BN2d'),
             bias='auto',
             **kwargs):
    """Build the two DCN adapters, the heatmap head and the task head.

    Args:
        in_channels (int): Input channels of the prediction heads.
        num_cls (int): Output channels of the heatmap head.
        heads (dict): Regression heads; a ``'heatmap'`` entry is ignored
            because the heatmap gets its own head here.
        dcn_config (dict): Config passed to ``build_conv_layer`` for both
            feature adapters.
        head_conv (int): Intermediate channels. Default: 64.
        final_kernel (int): Kernel size forwarded to ``SeparateHead``.
            Default: 1.
        init_bias (float): Initial bias of the heatmap output layer.
            Default: -2.19.
        conv_cfg (dict): Config of conv layer.
        norm_cfg (dict): Config of norm layer.
        bias (str): Type of bias of the intermediate layers.
    """
    super(DCNSeparateHead, self).__init__()
    # Drop the heatmap entry on a private copy; the caller's dict must not
    # change as a side effect of constructing this module.
    heads = {name: spec for name, spec in heads.items() if name != 'heatmap'}

    # feature adaptation with dcn
    # use separate features for classification / regression
    self.feature_adapt_cls = build_conv_layer(dcn_config)
    self.feature_adapt_reg = build_conv_layer(dcn_config)

    # heatmap prediction head
    cls_head = [
        ConvModule(
            in_channels,
            head_conv,
            kernel_size=3,
            padding=1,
            conv_cfg=conv_cfg,
            bias=bias,
            norm_cfg=norm_cfg),
        # ``init_weights`` fills this layer's bias, so it must always
        # exist regardless of the ``bias`` setting of the other layers.
        build_conv_layer(
            conv_cfg,
            head_conv,
            num_cls,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=True)
    ]
    self.cls_head = nn.Sequential(*cls_head)
    self.init_bias = init_bias

    # other regression target
    self.task_head = SeparateHead(
        in_channels,
        heads,
        head_conv=head_conv,
        final_kernel=final_kernel,
        bias=bias)
Default: '3d'. in_channels (list[int] | int): Channels of the input feature map. Default: [128]. tasks (list[dict]): Task information including class number and class names. Default: None. dataset (str): Name of the dataset. Default: 'nuscenes'. weight (float): Weight for location loss. Default: 0.25. code_weights (list[int]): Code weights for location loss. Default: []. common_heads (dict): Conv information for common heads. Default: dict(). loss_cls (dict): Config of classification loss function. Default: dict(type='GaussianFocalLoss', reduction='mean'). loss_bbox (dict): Config of regression loss function. Default: dict(type='L1Loss', reduction='none'). separate_head (dict): Config of separate head. Default: dict( type='SeparateHead', init_bias=-2.19, final_kernel=3) share_conv_channel (int): Output channels for share_conv_layer. Default: 64. num_heatmap_convs (int): Number of conv layers for heatmap conv layer. Default: 2. conv_cfg (dict): Config of conv layer. Default: dict(type='Conv2d') norm_cfg (dict): Config of norm layer. Default: dict(type='BN2d'). bias (str): Type of bias. Default: 'auto'. 
def __init__(self,
             in_channels=[128],
             tasks=None,
             train_cfg=None,
             test_cfg=None,
             bbox_coder=None,
             common_heads=dict(),
             loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
             loss_bbox=dict(
                 type='L1Loss', reduction='none', loss_weight=0.25),
             separate_head=dict(
                 type='SeparateHead', init_bias=-2.19, final_kernel=3),
             share_conv_channel=64,
             num_heatmap_convs=2,
             conv_cfg=dict(type='Conv2d'),
             norm_cfg=dict(type='BN2d'),
             bias='auto',
             norm_bbox=True):
    """Build losses, the bbox coder, the shared conv and one head per task.

    ``tasks`` is a list of dicts that each provide ``'class_names'``; one
    task head is built per entry.
    """
    super(CenterHead, self).__init__()

    num_classes = [len(t['class_names']) for t in tasks]
    self.class_names = [t['class_names'] for t in tasks]
    self.train_cfg = train_cfg
    self.test_cfg = test_cfg
    self.in_channels = in_channels
    self.num_classes = num_classes
    self.norm_bbox = norm_bbox

    self.loss_cls = build_loss(loss_cls)
    self.loss_bbox = build_loss(loss_bbox)
    self.bbox_coder = build_bbox_coder(bbox_coder)
    self.num_anchor_per_locs = [n for n in num_classes]
    self.fp16_enabled = False

    # a shared convolution
    self.shared_conv = ConvModule(
        in_channels,
        share_conv_channel,
        kernel_size=3,
        padding=1,
        conv_cfg=conv_cfg,
        norm_cfg=norm_cfg,
        bias=bias)

    self.task_heads = nn.ModuleList()

    for num_cls in num_classes:
        heads = copy.deepcopy(common_heads)
        heads.update(dict(heatmap=(num_cls, num_heatmap_convs)))
        # Fill in the per-task fields on a private copy. Updating
        # ``separate_head`` itself would write into the shared default
        # argument (or the caller's config) and leak across instances.
        head_cfg = copy.deepcopy(separate_head)
        head_cfg.update(
            in_channels=share_conv_channel, heads=heads, num_cls=num_cls)
        self.task_heads.append(builder.build_head(head_cfg))
def get_targets(self, gt_bboxes_3d, gt_labels_3d):
    """Generate targets.

    Args:
        gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground
            truth gt boxes.
        gt_labels_3d (list[torch.Tensor]): Labels of boxes.

    Returns:
        tuple[list[torch.Tensor]]: Tuple of target including
            the following results in order. Every list has one entry per
            task, stacked over the samples along a new leading dim.

            - list[torch.Tensor]: Heatmap scores.
            - list[torch.Tensor]: Ground truth boxes.
            - list[torch.Tensor]: Indexes indicating the
                position of the valid boxes.
            - list[torch.Tensor]: Masks indicating which
                boxes are valid.
    """
    heatmaps, anno_boxes, inds, masks = multi_apply(
        self.get_targets_single, gt_bboxes_3d, gt_labels_3d)
    # Every result is indexed [sample][task]. ``zip(*...)`` regroups it to
    # [task][sample] in pure Python, so the tensors are never pushed
    # through numpy and differently shaped per-task tensors are fine.
    # regroup heatmaps
    heatmaps = [torch.stack(hms_) for hms_ in zip(*heatmaps)]
    # regroup anno_boxes
    anno_boxes = [torch.stack(anno_boxes_) for anno_boxes_ in zip(*anno_boxes)]
    # regroup inds
    inds = [torch.stack(inds_) for inds_ in zip(*inds)]
    # regroup masks
    masks = [torch.stack(masks_) for masks_ in zip(*masks)]
    return heatmaps, anno_boxes, inds, masks
task_class.append(gt_labels_3d[m] + 1 - flag2) task_boxes.append(torch.cat(task_box, axis=0).to(device)) task_classes.append(torch.cat(task_class).long().to(device)) flag2 += len(mask) draw_gaussian = draw_heatmap_gaussian heatmaps, anno_boxes, inds, masks = [], [], [], [] for idx, task_head in enumerate(self.task_heads): heatmap = gt_bboxes_3d.new_zeros( (len(self.class_names[idx]), feature_map_size[1], feature_map_size[0])) anno_box = gt_bboxes_3d.new_zeros((max_objs, 10), dtype=torch.float32) ind = gt_labels_3d.new_zeros((max_objs), dtype=torch.int64) mask = gt_bboxes_3d.new_zeros((max_objs), dtype=torch.uint8) num_objs = min(task_boxes[idx].shape[0], max_objs) for k in range(num_objs): cls_id = task_classes[idx][k] - 1 width = task_boxes[idx][k][3] length = task_boxes[idx][k][4] width = width / voxel_size[0] / self.train_cfg[ 'out_size_factor'] length = length / voxel_size[1] / self.train_cfg[ 'out_size_factor'] if width > 0 and length > 0: radius = gaussian_radius( (length, width), min_overlap=self.train_cfg['gaussian_overlap']) radius = max(self.train_cfg['min_radius'], int(radius)) # be really careful for the coordinate system of # your box annotation. 
@force_fp32(apply_to=('preds_dicts', ))
def loss(self, gt_bboxes_3d, gt_labels_3d, preds_dicts, **kwargs):
    """Loss function for CenterHead.

    For every task the heatmap prediction is passed through
    ``clip_sigmoid`` and scored with ``self.loss_cls``; the regression
    branches are concatenated, gathered at the target indices and scored
    with ``self.loss_bbox``.

    Note:
        ``preds_dicts`` is modified: each ``preds_dict[0]['heatmap']`` is
        replaced by its clipped sigmoid and an ``'anno_box'`` entry is
        added.

    Args:
        gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground
            truth gt boxes.
        gt_labels_3d (list[torch.Tensor]): Labels of boxes.
        preds_dicts (dict): Output of forward function. It is iterated
            per task and element ``0`` of every item must provide the
            keys ``heatmap``, ``reg``, ``height``, ``dim``, ``rot`` and
            ``vel``.
        kwargs: Accepted for interface compatibility, not used here.

    Returns:
        dict[str:torch.Tensor]: Loss of heatmap and bbox of each task,
            stored under ``task{i}.loss_heatmap`` and
            ``task{i}.loss_bbox``.
    """
    heatmaps, anno_boxes, inds, masks = self.get_targets(
        gt_bboxes_3d, gt_labels_3d)
    # Optional per-dimension regression weights; absent means "all equal".
    code_weights = self.train_cfg.get('code_weights', None)

    loss_dict = dict()
    for task_id, preds_dict in enumerate(preds_dicts):
        # heatmap focal loss
        preds_dict[0]['heatmap'] = clip_sigmoid(preds_dict[0]['heatmap'])
        num_pos = heatmaps[task_id].eq(1).float().sum().item()
        loss_heatmap = self.loss_cls(
            preds_dict[0]['heatmap'],
            heatmaps[task_id],
            avg_factor=max(num_pos, 1))

        target_box = anno_boxes[task_id]
        # reconstruct the anno_box from multiple reg heads
        preds_dict[0]['anno_box'] = torch.cat(
            (preds_dict[0]['reg'], preds_dict[0]['height'],
             preds_dict[0]['dim'], preds_dict[0]['rot'],
             preds_dict[0]['vel']),
            dim=1)

        # Regression loss for dimension, offset, height, rotation
        ind = inds[task_id]
        num = masks[task_id].float().sum()
        pred = preds_dict[0]['anno_box'].permute(0, 2, 3, 1).contiguous()
        pred = pred.view(pred.size(0), -1, pred.size(3))
        pred = self._gather_feat(pred, ind)

        # Zero the weight of padded slots and of NaN target entries.
        mask = masks[task_id].unsqueeze(2).expand_as(target_box).float()
        isnotnan = (~torch.isnan(target_box)).float()
        mask *= isnotnan

        if code_weights is None:
            # `new_tensor(None)` would raise; without configured weights
            # the validity mask alone is the weight.
            bbox_weights = mask
        else:
            bbox_weights = mask * mask.new_tensor(code_weights)

        loss_bbox = self.loss_bbox(
            pred, target_box, bbox_weights, avg_factor=(num + 1e-4))
        loss_dict[f'task{task_id}.loss_heatmap'] = loss_heatmap
        loss_dict[f'task{task_id}.loss_bbox'] = loss_bbox
    return loss_dict
""" rets = [] for task_id, preds_dict in enumerate(preds_dicts): num_class_with_bg = self.num_classes[task_id] batch_size = preds_dict[0]['heatmap'].shape[0] batch_heatmap = preds_dict[0]['heatmap'].sigmoid() batch_reg = preds_dict[0]['reg'] batch_hei = preds_dict[0]['height'] if self.norm_bbox: batch_dim = torch.exp(preds_dict[0]['dim']) else: batch_dim = preds_dict[0]['dim'] batch_rots = preds_dict[0]['rot'][:, 0].unsqueeze(1) batch_rotc = preds_dict[0]['rot'][:, 1].unsqueeze(1) if 'vel' in preds_dict[0]: batch_vel = preds_dict[0]['vel'] else: batch_vel = None temp = self.bbox_coder.decode( batch_heatmap, batch_rots, batch_rotc, batch_hei, batch_dim, batch_vel, reg=batch_reg, task_id=task_id) assert self.test_cfg['nms_type'] in ['circle', 'rotate'] batch_reg_preds = [box['bboxes'] for box in temp] batch_cls_preds = [box['scores'] for box in temp] batch_cls_labels = [box['labels'] for box in temp] if self.test_cfg['nms_type'] == 'circle': ret_task = [] for i in range(batch_size): boxes3d = temp[i]['bboxes'] scores = temp[i]['scores'] labels = temp[i]['labels'] centers = boxes3d[:, [0, 1]] boxes = torch.cat([centers, scores.view(-1, 1)], dim=1) keep = torch.tensor( circle_nms( boxes.detach().cpu().numpy(), self.test_cfg['min_radius'][task_id], post_max_size=self.test_cfg['post_max_size']), dtype=torch.long, device=boxes.device) boxes3d = boxes3d[keep] scores = scores[keep] labels = labels[keep] ret = dict(bboxes=boxes3d, scores=scores, labels=labels) ret_task.append(ret) rets.append(ret_task) else: rets.append( self.get_task_detections(num_class_with_bg, batch_cls_preds, batch_reg_preds, batch_cls_labels, img_metas)) # Merge branches results num_samples = len(rets[0]) ret_list = [] for i in range(num_samples): for k in rets[0][i].keys(): if k == 'bboxes': bboxes = torch.cat([ret[i][k] for ret in rets]) bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 bboxes = img_metas[i]['box_type_3d']( bboxes, self.bbox_coder.code_size) elif k == 'scores': scores = 
def get_task_detections(self, num_class_with_bg, batch_cls_preds,
                        batch_reg_preds, batch_cls_labels, img_metas):
    """Rotate nms for each task.

    Per sample: drop predictions below ``test_cfg['score_threshold']``
    (when it is positive), run ``nms_gpu`` on the BEV boxes, then keep
    only boxes whose first three coordinates lie inside
    ``test_cfg['post_center_limit_range']``. An empty or missing range
    disables that last filter.

    Args:
        num_class_with_bg (int): Number of classes for the current task.
            When it is 1 every label is set to 0.
        batch_cls_preds (list[torch.Tensor]): Prediction score with the
            shape of [N].
        batch_reg_preds (list[torch.Tensor]): Prediction bbox with the
            shape of [N, 9].
        batch_cls_labels (list[torch.Tensor]): Prediction label with the
            shape of [N].
        img_metas (list[dict]): Meta information of each sample. The
            ``box_type_3d`` entry is used to build the boxes for nms.

    Returns:
        list[dict[str: torch.Tensor]]: contains the following keys:

            -bboxes (torch.Tensor): Prediction bboxes after nms with the
                shape of [N, 9].
            -scores (torch.Tensor): Prediction scores after nms with the
                shape of [N].
            -labels (torch.Tensor): Prediction labels after nms with the
                shape of [N].
    """
    test_cfg = self.test_cfg
    code_size = self.bbox_coder.code_size
    score_thr = test_cfg['score_threshold']

    # Normalise "no range configured" ([] or None) to None so the filter
    # below is skipped instead of comparing a tensor against an empty list.
    center_range = test_cfg.get('post_center_limit_range', None)
    if center_range is not None and len(center_range) > 0:
        center_range = torch.tensor(
            center_range,
            dtype=batch_reg_preds[0].dtype,
            device=batch_reg_preds[0].device)
    else:
        center_range = None

    predictions_dicts = []
    for i, (box_preds, cls_preds, cls_labels) in enumerate(
            zip(batch_reg_preds, batch_cls_preds, batch_cls_labels)):
        # Flatten to [N]; unlike squeeze(-1) this keeps a single score
        # one-dimensional.
        top_scores = cls_preds.reshape(-1)
        if num_class_with_bg == 1:
            top_labels = torch.zeros(
                top_scores.shape[0],
                device=cls_preds.device,
                dtype=torch.long)
        else:
            top_labels = cls_labels.long()

        if score_thr > 0.0:
            keep = top_scores >= score_thr
            top_scores = top_scores[keep]
            box_preds = box_preds[keep]
            top_labels = top_labels[keep]

        if top_scores.shape[0] != 0:
            # Apply NMS in birdeye view. The nms in 3d detection just
            # removes overlapped boxes.
            boxes_for_nms = xywhr2xyxyr(img_metas[i]['box_type_3d'](
                box_preds, code_size).bev)
            selected = nms_gpu(
                boxes_for_nms,
                top_scores,
                thresh=test_cfg['nms_thr'],
                pre_maxsize=test_cfg['pre_max_size'],
                post_max_size=test_cfg['post_max_size'])
            box_preds = box_preds[selected]
            top_scores = top_scores[selected]
            top_labels = top_labels[selected]

        if box_preds.shape[0] == 0:
            # Nothing survived: emit correctly shaped empty tensors.
            dtype = batch_reg_preds[0].dtype
            device = batch_reg_preds[0].device
            predictions_dicts.append(
                dict(
                    bboxes=torch.zeros([0, code_size],
                                       dtype=dtype,
                                       device=device),
                    scores=torch.zeros([0], dtype=dtype, device=device),
                    labels=torch.zeros([0],
                                       dtype=top_labels.dtype,
                                       device=device)))
            continue

        if center_range is not None:
            in_range = (box_preds[:, :3] >= center_range[:3]).all(1)
            in_range &= (box_preds[:, :3] <= center_range[3:]).all(1)
            box_preds = box_preds[in_range]
            top_scores = top_scores[in_range]
            top_labels = top_labels[in_range]

        predictions_dicts.append(
            dict(bboxes=box_preds, scores=top_scores, labels=top_labels))
    return predictions_dicts
def __init__(self,
             pre_anchor_topk=50,
             bbox_thr=0.6,
             gamma=2.0,
             alpha=0.5,
             **kwargs):
    # All remaining keyword arguments are forwarded to the parent head.
    super().__init__(**kwargs)
    self.pre_anchor_topk = pre_anchor_topk
    self.bbox_thr = bbox_thr
    self.gamma = gamma
    self.alpha = alpha


@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds'))
def loss(self,
         cls_scores,
         bbox_preds,
         dir_cls_preds,
         gt_bboxes,
         gt_labels,
         input_metas,
         gt_bboxes_ignore=None):
    """Calculate loss of FreeAnchor head.

    Args:
        cls_scores (list[torch.Tensor]): Classification scores of
            different samples.
        bbox_preds (list[torch.Tensor]): Box predictions of
            different samples
        dir_cls_preds (list[torch.Tensor]): Direction predictions of
            different samples
        gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes.
        gt_labels (list[torch.Tensor]): Ground truth labels.
        input_metas (list[dict]): List of input meta information.
        gt_bboxes_ignore (list[:obj:`BaseInstance3DBoxes`], optional):
            Ground truth boxes that should be ignored. Defaults to None.
            Not used by this implementation.

    Returns:
        dict[str, torch.Tensor]: Loss items.

            - positive_bag_loss (torch.Tensor): Loss of positive samples.
            - negative_bag_loss (torch.Tensor): Loss of negative samples.
    """
    featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
    assert len(featmap_sizes) == self.anchor_generator.num_levels

    anchor_list = self.get_anchors(featmap_sizes, input_metas)
    anchors = [torch.cat(anchor) for anchor in anchor_list]

    # concatenate each level
    cls_scores = [
        cls_score.permute(0, 2, 3, 1).reshape(
            cls_score.size(0), -1, self.num_classes)
        for cls_score in cls_scores
    ]
    bbox_preds = [
        bbox_pred.permute(0, 2, 3, 1).reshape(
            bbox_pred.size(0), -1, self.box_code_size)
        for bbox_pred in bbox_preds
    ]
    dir_cls_preds = [
        dir_cls_pred.permute(0, 2, 3,
                             1).reshape(dir_cls_pred.size(0), -1, 2)
        for dir_cls_pred in dir_cls_preds
    ]

    cls_scores = torch.cat(cls_scores, dim=1)
    bbox_preds = torch.cat(bbox_preds, dim=1)
    dir_cls_preds = torch.cat(dir_cls_preds, dim=1)

    cls_prob = torch.sigmoid(cls_scores)
    box_prob = []
    num_pos = 0
    positive_losses = []
    for (anchors_, gt_labels_, gt_bboxes_, cls_prob_, bbox_preds_,
         dir_cls_preds_) in zip(anchors, gt_labels, gt_bboxes, cls_prob,
                                bbox_preds, dir_cls_preds):

        gt_bboxes_ = gt_bboxes_.tensor.to(anchors_.device)

        with torch.no_grad():
            # box_localization: a_{j}^{loc}, shape: [j, 4]
            pred_boxes = self.bbox_coder.decode(anchors_, bbox_preds_)

            # object_box_iou: IoU_{ij}^{loc}, shape: [i, j]
            object_box_iou = bbox_overlaps_nearest_3d(
                gt_bboxes_, pred_boxes)

            # object_box_prob: P{a_{j} -> b_{i}}, shape: [i, j]
            t1 = self.bbox_thr
            t2 = object_box_iou.max(
                dim=1, keepdim=True).values.clamp(min=t1 + 1e-12)
            object_box_prob = ((object_box_iou - t1) / (t2 - t1)).clamp(
                min=0, max=1)

            # object_cls_box_prob: P{a_{j} -> b_{i}}, shape: [i, c, j]
            num_obj = gt_labels_.size(0)
            indices = torch.stack(
                [torch.arange(num_obj).type_as(gt_labels_), gt_labels_],
                dim=0)
            object_cls_box_prob = torch.sparse_coo_tensor(
                indices, object_box_prob)

            # image_box_iou: P{a_{j} \in A_{+}}, shape: [c, j]
            # The code from "start" to "end" implements:
            #   image_box_iou = torch.sparse.max(object_cls_box_prob,
            #                                    dim=0).t()
            # start
            box_cls_prob = torch.sparse.sum(
                object_cls_box_prob, dim=0).to_dense()

            indices = torch.nonzero(box_cls_prob, as_tuple=False).t_()
            if indices.numel() == 0:
                image_box_prob = torch.zeros(
                    anchors_.size(0),
                    self.num_classes).type_as(object_box_prob)
            else:
                nonzero_box_prob = torch.where(
                    (gt_labels_.unsqueeze(dim=-1) == indices[0]),
                    object_box_prob[:, indices[1]],
                    object_box_prob.new_zeros(1)).max(dim=0).values

                # upmap to shape [j, c]
                image_box_prob = torch.sparse_coo_tensor(
                    indices.flip([0]),
                    nonzero_box_prob,
                    size=(anchors_.size(0), self.num_classes)).to_dense()
            # end

            box_prob.append(image_box_prob)

        # construct bags for objects
        match_quality_matrix = bbox_overlaps_nearest_3d(
            gt_bboxes_, anchors_)
        _, matched = torch.topk(
            match_quality_matrix,
            self.pre_anchor_topk,
            dim=1,
            sorted=False)
        del match_quality_matrix

        # matched_cls_prob: P_{ij}^{cls}
        matched_cls_prob = torch.gather(
            cls_prob_[matched], 2,
            gt_labels_.view(-1, 1, 1).repeat(1, self.pre_anchor_topk,
                                             1)).squeeze(2)

        # matched_box_prob: P_{ij}^{loc}
        matched_anchors = anchors_[matched]
        matched_object_targets = self.bbox_coder.encode(
            matched_anchors,
            gt_bboxes_.unsqueeze(dim=1).expand_as(matched_anchors))

        # direction classification loss
        loss_dir = None
        if self.use_direction_classifier:
            # also calculate direction prob: P_{ij}^{dir}
            matched_dir_targets = get_direction_target(
                matched_anchors,
                matched_object_targets,
                self.dir_offset,
                one_hot=False)
            loss_dir = self.loss_dir(
                dir_cls_preds_[matched].transpose(-2, -1),
                matched_dir_targets,
                reduction_override='none')

        # Keep the gathered predictions in a local tensor. Scattering the
        # sin-encoded values back into `bbox_preds_` would (a) let one
        # object's encoding overwrite another's whenever an anchor is in
        # several bags, and (b) modify the network output in place.
        matched_bbox_preds = bbox_preds_[matched]
        if self.diff_rad_by_sin:
            matched_bbox_preds, matched_object_targets = \
                self.add_sin_difference(
                    matched_bbox_preds, matched_object_targets)

        # generate bbox weights
        bbox_weights = matched_anchors.new_ones(matched_anchors.size())
        # Use pop is not right, check performance
        code_weight = self.train_cfg.get('code_weight', None)
        if code_weight:
            bbox_weights = bbox_weights * bbox_weights.new_tensor(
                code_weight)
        loss_bbox = self.loss_bbox(
            matched_bbox_preds,
            matched_object_targets,
            bbox_weights,
            reduction_override='none').sum(-1)

        if loss_dir is not None:
            loss_bbox += loss_dir
        matched_box_prob = torch.exp(-loss_bbox)

        # positive_losses: {-log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) )}
        num_pos += len(gt_bboxes_)
        positive_losses.append(
            self.positive_bag_loss(matched_cls_prob, matched_box_prob))

    positive_loss = torch.cat(positive_losses).sum() / max(1, num_pos)

    # box_prob: P{a_{j} \in A_{+}}
    box_prob = torch.stack(box_prob, dim=0)

    # negative_loss:
    # \sum_{j}{ FL((1 - P{a_{j} \in A_{+}}) * (1 - P_{j}^{bg})) } / n||B||
    negative_loss = self.negative_bag_loss(cls_prob, box_prob).sum() / max(
        1, num_pos * self.pre_anchor_topk)

    losses = {
        'positive_bag_loss': positive_loss,
        'negative_bag_loss': negative_loss
    }
    return losses


def positive_bag_loss(self, matched_cls_prob, matched_box_prob):
    """Generate positive bag loss.

    Args:
        matched_cls_prob (torch.Tensor): Classification probability
            of matched positive samples.
        matched_box_prob (torch.Tensor): Bounding box probability
            of matched positive samples.

    Returns:
        torch.Tensor: Loss of positive samples, one value per bag
            (``reduction='none'``), scaled by ``self.alpha``.
    """
    # bag_prob = Mean-max(matched_prob)
    matched_prob = matched_cls_prob * matched_box_prob
    weight = 1 / torch.clamp(1 - matched_prob, 1e-12, None)
    weight /= weight.sum(dim=1).unsqueeze(dim=-1)
    bag_prob = (weight * matched_prob).sum(dim=1)
    # positive_bag_loss = -self.alpha * log(bag_prob)
    bag_prob = bag_prob.clamp(0, 1)  # to avoid bug of BCE, check
    return self.alpha * F.binary_cross_entropy(
        bag_prob, torch.ones_like(bag_prob), reduction='none')


def negative_bag_loss(self, cls_prob, box_prob):
    """Generate negative bag loss.

    Args:
        cls_prob (torch.Tensor): Classification probability
            of negative samples.
        box_prob (torch.Tensor): Bounding box probability
            of negative samples.

    Returns:
        torch.Tensor: Loss of negative samples, element-wise
            (``reduction='none'``), scaled by ``1 - self.alpha``.
    """
    prob = cls_prob * (1 - box_prob)
    prob = prob.clamp(0, 1)  # to avoid bug of BCE, check
    negative_bag_loss = prob**self.gamma * F.binary_cross_entropy(
        prob, torch.zeros_like(prob), reduction='none')
    return (1 - self.alpha) * negative_bag_loss
def __init__(self,
             num_classes,
             in_channels,
             train_cfg,
             test_cfg,
             feat_channels=256,
             use_direction_classifier=True,
             anchor_generator=dict(
                 type='Anchor3DRangeGenerator',
                 range=[0, -39.68, -1.78, 69.12, 39.68, -1.78],
                 strides=[2],
                 sizes=[[1.6, 3.9, 1.56]],
                 rotations=[0, 1.57],
                 custom_values=[],
                 reshape_out=False),
             assigner_per_size=False,
             assign_per_class=False,
             diff_rad_by_sin=True,
             dir_offset=0,
             dir_limit_offset=1,
             bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
             loss_cls=dict(
                 type='CrossEntropyLoss',
                 use_sigmoid=True,
                 loss_weight=1.0),
             loss_bbox=dict(
                 type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
             loss_dir=dict(type='CrossEntropyLoss', loss_weight=0.2)):
    # Pure pass-through: every argument is forwarded positionally, so the
    # order here must stay in sync with the parent constructor.
    super().__init__(num_classes, in_channels, train_cfg, test_cfg,
                     feat_channels, use_direction_classifier,
                     anchor_generator, assigner_per_size, assign_per_class,
                     diff_rad_by_sin, dir_offset, dir_limit_offset,
                     bbox_coder, loss_cls, loss_bbox, loss_dir)


@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds'))
def loss(self,
         cls_scores,
         bbox_preds,
         dir_cls_preds,
         gt_bboxes,
         gt_labels,
         input_metas,
         gt_bboxes_ignore=None):
    """Calculate losses.

    Delegates to the parent ``loss`` and re-keys its ``loss_cls``,
    ``loss_bbox`` and ``loss_dir`` entries with an ``rpn`` infix.

    Args:
        cls_scores (list[torch.Tensor]): Multi-level class scores.
        bbox_preds (list[torch.Tensor]): Multi-level bbox predictions.
        dir_cls_preds (list[torch.Tensor]): Multi-level direction
            class predictions.
        gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes
            of each sample.
        gt_labels (list[torch.Tensor]): Labels of each sample.
        input_metas (list[dict]): Point cloud and image's meta info.
        gt_bboxes_ignore (None | list[torch.Tensor]): Specify
            which bounding boxes to ignore; passed through unchanged to
            the parent ``loss``.

    Returns:
        dict[str, list[torch.Tensor]]: Classification, bbox, and
            direction losses of each level.

            - loss_rpn_cls (list[torch.Tensor]): Classification losses.
            - loss_rpn_bbox (list[torch.Tensor]): Box regression losses.
            - loss_rpn_dir (list[torch.Tensor]): Direction classification
                losses.
    """
    loss_dict = super().loss(cls_scores, bbox_preds, dir_cls_preds,
                             gt_bboxes, gt_labels, input_metas,
                             gt_bboxes_ignore)
    # change the loss key names to avoid conflict
    return dict(
        loss_rpn_cls=loss_dict['loss_cls'],
        loss_rpn_bbox=loss_dict['loss_bbox'],
        loss_rpn_dir=loss_dict['loss_dir'])
""" assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) mlvl_bboxes = [] mlvl_max_scores = [] mlvl_label_pred = [] mlvl_dir_scores = [] mlvl_cls_score = [] for cls_score, bbox_pred, dir_cls_pred, anchors in zip( cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors): assert cls_score.size()[-2:] == bbox_pred.size()[-2:] assert cls_score.size()[-2:] == dir_cls_pred.size()[-2:] dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] cls_score = cls_score.permute(1, 2, 0).reshape(-1, self.num_classes) if self.use_sigmoid_cls: scores = cls_score.sigmoid() else: scores = cls_score.softmax(-1) bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, self.box_code_size) nms_pre = cfg.get('nms_pre', -1) if self.use_sigmoid_cls: max_scores, pred_labels = scores.max(dim=1) else: max_scores, pred_labels = scores[:, :-1].max(dim=1) # get topk if nms_pre > 0 and scores.shape[0] > nms_pre: topk_scores, topk_inds = max_scores.topk(nms_pre) anchors = anchors[topk_inds, :] bbox_pred = bbox_pred[topk_inds, :] max_scores = topk_scores cls_score = scores[topk_inds, :] dir_cls_score = dir_cls_score[topk_inds] pred_labels = pred_labels[topk_inds] bboxes = self.bbox_coder.decode(anchors, bbox_pred) mlvl_bboxes.append(bboxes) mlvl_max_scores.append(max_scores) mlvl_cls_score.append(cls_score) mlvl_label_pred.append(pred_labels) mlvl_dir_scores.append(dir_cls_score) mlvl_bboxes = torch.cat(mlvl_bboxes) mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( mlvl_bboxes, box_dim=self.box_code_size).bev) mlvl_max_scores = torch.cat(mlvl_max_scores) mlvl_label_pred = torch.cat(mlvl_label_pred) mlvl_dir_scores = torch.cat(mlvl_dir_scores) # shape [k, num_class] before sigmoid # PartA2 need to keep raw classification score # becase the bbox head in the second stage does not have # classification branch, # roi head need this score as classification score mlvl_cls_score = torch.cat(mlvl_cls_score) score_thr = cfg.get('score_thr', 0) 
def class_agnostic_nms(self, mlvl_bboxes, mlvl_bboxes_for_nms,
                       mlvl_max_scores, mlvl_label_pred, mlvl_cls_score,
                       mlvl_dir_scores, score_thr, max_num, cfg,
                       input_meta):
    """Class agnostic nms for single batch.

    Candidates with a max score not greater than ``score_thr`` are
    dropped, the rest go through a single NMS pass that ignores the class
    label, the yaw of the survivors is corrected with the direction
    classifier output, and at most ``max_num`` boxes with the highest
    scores are returned.

    Args:
        mlvl_bboxes (torch.Tensor): Bboxes from Multi-level.
        mlvl_bboxes_for_nms (torch.Tensor): Bboxes for nms
            (bev or minmax boxes) from Multi-level.
        mlvl_max_scores (torch.Tensor): Max scores of Multi-level bbox.
        mlvl_label_pred (torch.Tensor): Class predictions
            of Multi-level bbox.
        mlvl_cls_score (torch.Tensor): Class scores of
            Multi-level bbox.
        mlvl_dir_scores (torch.Tensor): Direction scores of
            Multi-level bbox.
        score_thr (int): Score threshold.
        max_num (int): Max number of bboxes after nms.
        cfg (None | :obj:`ConfigDict`): Training or testing config.
            ``cfg.use_rotate_nms`` selects ``nms_gpu`` over
            ``nms_normal_gpu`` and ``cfg.nms_thr`` is the NMS threshold.
        input_meta (dict): Contain pcd and img's meta info.

    Returns:
        dict: Predictions of single batch. Contain the keys:

            - boxes_3d (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes.
            - scores_3d (torch.Tensor): Score of each bbox.
            - labels_3d (torch.Tensor): Label of each bbox. Has the dtype
                of ``mlvl_label_pred`` also when no box is kept.
            - cls_preds (torch.Tensor): Class score of each bbox.
    """
    keep = mlvl_max_scores > score_thr
    scores = mlvl_max_scores[keep]

    selected = []
    if scores.shape[0] > 0:
        # Only launch NMS when at least one candidate is left.
        nms_func = nms_gpu if cfg.use_rotate_nms else nms_normal_gpu
        selected = nms_func(mlvl_bboxes_for_nms[keep, :], scores,
                            cfg.nms_thr)

    if len(selected) == 0:
        return dict(
            boxes_3d=input_meta['box_type_3d'](
                mlvl_bboxes.new_zeros([0, self.box_code_size]),
                box_dim=self.box_code_size),
            scores_3d=mlvl_bboxes.new_zeros([0]),
            # same dtype as the labels returned for non-empty results
            labels_3d=mlvl_label_pred.new_zeros([0]),
            cls_preds=mlvl_bboxes.new_zeros([0, mlvl_cls_score.shape[-1]]))

    # Indexing with a mask and then with the NMS indices yields fresh
    # tensors, so the in-place yaw update below leaves the inputs alone.
    bboxes = mlvl_bboxes[keep, :][selected]
    scores = scores[selected]
    labels = mlvl_label_pred[keep][selected]
    cls_scores = mlvl_cls_score[keep][selected]
    dir_scores = mlvl_dir_scores[keep][selected]

    # Wrap the regressed yaw into one period and add pi when the direction
    # classifier picked bin 1.
    dir_rot = limit_period(bboxes[..., 6] - self.dir_offset,
                           self.dir_limit_offset, np.pi)
    bboxes[..., 6] = (
        dir_rot + self.dir_offset + np.pi * dir_scores.to(bboxes.dtype))

    if bboxes.shape[0] > max_num:
        _, inds = scores.sort(descending=True)
        inds = inds[:max_num]
        bboxes = bboxes[inds, :]
        labels = labels[inds]
        scores = scores[inds]
        cls_scores = cls_scores[inds]

    return dict(
        boxes_3d=input_meta['box_type_3d'](
            bboxes, box_dim=self.box_code_size),
        scores_3d=scores,
        labels_3d=labels,
        cls_preds=cls_scores  # raw scores [max_num, cls_num]
    )
@HEADS.register_module()
class BaseShapeHead(nn.Module):
    """Base Shape-aware Head in Shape Signature Network.

    Note:
        This base shape-aware grouping head uses default settings for small
        objects. For large and huge objects, it is recommended to use
        heavier heads, like (64, 64, 64) and (128, 128, 64, 64, 64) in
        shared conv channels, (2, 1, 1) and (2, 1, 2, 1, 1) in shared conv
        strides. For tiny objects, we can use smaller heads, like (32, 32)
        channels and (1, 1) strides.

    Args:
        num_cls (int): Number of classes.
        num_base_anchors (int): Number of anchors per location.
        box_code_size (int): The dimension of boxes to be encoded.
        in_channels (int): Input channels for convolutional layers.
        shared_conv_channels (tuple): Channels for shared convolutional
            layers. Default: (64, 64).
        shared_conv_strides (tuple): Strides for shared convolutional
            layers. Default: (1, 1).
        use_direction_classifier (bool, optional): Whether to use direction
            classifier. Default: True.
        conv_cfg (dict): Config of conv layer. Default: dict(type='Conv2d')
        norm_cfg (dict): Config of norm layer. Default: dict(type='BN2d').
        bias (bool|str, optional): Type of bias. Default: False.
    """

    def __init__(self,
                 num_cls,
                 num_base_anchors,
                 box_code_size,
                 in_channels,
                 shared_conv_channels=(64, 64),
                 shared_conv_strides=(1, 1),
                 use_direction_classifier=True,
                 conv_cfg=dict(type='Conv2d'),
                 norm_cfg=dict(type='BN2d'),
                 bias=False):
        super().__init__()
        self.num_cls = num_cls
        self.num_base_anchors = num_base_anchors
        self.use_direction_classifier = use_direction_classifier
        self.box_code_size = box_code_size

        assert len(shared_conv_channels) == len(shared_conv_strides), \
            'Lengths of channels and strides list should be equal.'

        self.shared_conv_channels = [in_channels] + list(shared_conv_channels)
        self.shared_conv_strides = list(shared_conv_strides)

        # One 3x3 ConvModule per stride; layer i maps channels[i] to
        # channels[i + 1]. Layers are built in order, as before.
        stages = [
            ConvModule(
                c_in,
                c_out,
                kernel_size=3,
                stride=stride,
                padding=1,
                conv_cfg=conv_cfg,
                bias=bias,
                norm_cfg=norm_cfg)
            for c_in, c_out, stride in zip(self.shared_conv_channels[:-1],
                                           self.shared_conv_channels[1:],
                                           self.shared_conv_strides)
        ]
        self.shared_conv = nn.Sequential(*stages)

        # 1x1 prediction layers on top of the shared trunk.
        trunk_channels = self.shared_conv_channels[-1]
        self.conv_cls = nn.Conv2d(trunk_channels, num_base_anchors * num_cls,
                                  1)
        self.conv_reg = nn.Conv2d(trunk_channels,
                                  num_base_anchors * box_code_size, 1)
        if use_direction_classifier:
            self.conv_dir_cls = nn.Conv2d(trunk_channels,
                                          num_base_anchors * 2, 1)

    def init_weights(self):
        """Initialize weights."""
        cls_bias = bias_init_with_prob(0.01)
        # shared conv layers have already been initialized by ConvModule
        normal_init(self.conv_cls, std=0.01, bias=cls_bias)
        normal_init(self.conv_reg, std=0.01)
        if self.use_direction_classifier:
            normal_init(self.conv_dir_cls, std=0.01, bias=cls_bias)

    def forward(self, x):
        """Forward function of the shape-aware head.

        Args:
            x (torch.Tensor): Input feature map with the shape of
                [B, C, H, W].

        Returns:
            dict[torch.Tensor]: Contain score of each class, bbox
                regression and direction classification predictions
                (``dir_cls_preds`` is None without direction classifier)
                plus the ``featmap_size`` of the prediction maps.
                The prediction tensors are reshaped to
                [B, num_base_anchors*H*W, num_cls/box_code_size/dir_bins],
                which makes it convenient to concat anchors for different
                classes even though they have different feature map sizes.
        """
        feats = self.shared_conv(x)
        cls_map = self.conv_cls(feats)
        reg_map = self.conv_reg(feats)

        featmap_size = reg_map.shape[-2:]
        H, W = featmap_size
        B = reg_map.shape[0]

        def flatten(pred_map, channels):
            # [B, A*channels, H, W] -> [B, A*H*W, channels]
            split = pred_map.view(-1, self.num_base_anchors, channels, H, W)
            return split.permute(0, 1, 3, 4, 2).reshape(B, -1, channels)

        cls_score = flatten(cls_map, self.num_cls)
        bbox_pred = flatten(reg_map, self.box_code_size)

        if self.use_direction_classifier:
            dir_cls_preds = flatten(self.conv_dir_cls(feats), 2)
        else:
            dir_cls_preds = None

        return dict(
            cls_score=cls_score,
            bbox_pred=bbox_pred,
            dir_cls_preds=dir_cls_preds,
            featmap_size=featmap_size)
""" def __init__(self, tasks, assign_per_class=True, **kwargs): self.tasks = tasks self.featmap_sizes = [] super().__init__(assign_per_class=assign_per_class, **kwargs) def _init_layers(self): """Initialize neural network layers of the head.""" self.heads = nn.ModuleList() cls_ptr = 0 for task in self.tasks: sizes = self.anchor_generator.sizes[cls_ptr:cls_ptr + task['num_class']] num_size = torch.tensor(sizes).reshape(-1, 3).size(0) num_rot = len(self.anchor_generator.rotations) num_base_anchors = num_rot * num_size branch = dict( type='BaseShapeHead', num_cls=self.num_classes, num_base_anchors=num_base_anchors, box_code_size=self.box_code_size, in_channels=self.in_channels, shared_conv_channels=task['shared_conv_channels'], shared_conv_strides=task['shared_conv_strides']) self.heads.append(build_head(branch)) cls_ptr += task['num_class'] def init_weights(self): """Initialize the weights of head.""" for head in self.heads: head.init_weights() def forward_single(self, x): """Forward function on a single-scale feature map. Args: x (torch.Tensor): Input features. Returns: tuple[torch.Tensor]: Contain score of each class, bbox \ regression and direction classification predictions. """ results = [] for head in self.heads: results.append(head(x)) cls_score = torch.cat([result['cls_score'] for result in results], dim=1) bbox_pred = torch.cat([result['bbox_pred'] for result in results], dim=1) dir_cls_preds = None if self.use_direction_classifier: dir_cls_preds = torch.cat( [result['dir_cls_preds'] for result in results], dim=1) self.featmap_sizes = [] for i, task in enumerate(self.tasks): for _ in range(task['num_class']): self.featmap_sizes.append(results[i]['featmap_size']) assert len(self.featmap_sizes) == len(self.anchor_generator.ranges), \ 'Length of feature map sizes must be equal to length of ' + \ 'different ranges of anchor generator.' 
return cls_score, bbox_pred, dir_cls_preds def loss_single(self, cls_score, bbox_pred, dir_cls_preds, labels, label_weights, bbox_targets, bbox_weights, dir_targets, dir_weights, num_total_samples): """Calculate loss of Single-level results. Args: cls_score (torch.Tensor): Class score in single-level. bbox_pred (torch.Tensor): Bbox prediction in single-level. dir_cls_preds (torch.Tensor): Predictions of direction class in single-level. labels (torch.Tensor): Labels of class. label_weights (torch.Tensor): Weights of class loss. bbox_targets (torch.Tensor): Targets of bbox predictions. bbox_weights (torch.Tensor): Weights of bbox loss. dir_targets (torch.Tensor): Targets of direction predictions. dir_weights (torch.Tensor): Weights of direction loss. num_total_samples (int): The number of valid samples. Returns: tuple[torch.Tensor]: Losses of class, bbox \ and direction, respectively. """ # classification loss if num_total_samples is None: num_total_samples = int(cls_score.shape[0]) labels = labels.reshape(-1) label_weights = label_weights.reshape(-1) cls_score = cls_score.reshape(-1, self.num_classes) loss_cls = self.loss_cls( cls_score, labels, label_weights, avg_factor=num_total_samples) # regression loss bbox_targets = bbox_targets.reshape(-1, self.box_code_size) bbox_weights = bbox_weights.reshape(-1, self.box_code_size) code_weight = self.train_cfg.get('code_weight', None) if code_weight: bbox_weights = bbox_weights * bbox_weights.new_tensor(code_weight) bbox_pred = bbox_pred.reshape(-1, self.box_code_size) if self.diff_rad_by_sin: bbox_pred, bbox_targets = self.add_sin_difference( bbox_pred, bbox_targets) loss_bbox = self.loss_bbox( bbox_pred, bbox_targets, bbox_weights, avg_factor=num_total_samples) # direction classification loss loss_dir = None if self.use_direction_classifier: dir_cls_preds = dir_cls_preds.reshape(-1, 2) dir_targets = dir_targets.reshape(-1) dir_weights = dir_weights.reshape(-1) loss_dir = self.loss_dir( dir_cls_preds, dir_targets, 
dir_weights, avg_factor=num_total_samples) return loss_cls, loss_bbox, loss_dir def loss(self, cls_scores, bbox_preds, dir_cls_preds, gt_bboxes, gt_labels, input_metas, gt_bboxes_ignore=None): """Calculate losses. Args: cls_scores (list[torch.Tensor]): Multi-level class scores. bbox_preds (list[torch.Tensor]): Multi-level bbox predictions. dir_cls_preds (list[torch.Tensor]): Multi-level direction class predictions. gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Gt bboxes of each sample. gt_labels (list[torch.Tensor]): Gt labels of each sample. input_metas (list[dict]): Contain pcd and img's meta info. gt_bboxes_ignore (None | list[torch.Tensor]): Specify which bounding. Returns: dict[str, list[torch.Tensor]]: Classification, bbox, and \ direction losses of each level. - loss_cls (list[torch.Tensor]): Classification losses. - loss_bbox (list[torch.Tensor]): Box regression losses. - loss_dir (list[torch.Tensor]): Direction classification \ losses. """ device = cls_scores[0].device anchor_list = self.get_anchors( self.featmap_sizes, input_metas, device=device) cls_reg_targets = self.anchor_target_3d( anchor_list, gt_bboxes, input_metas, gt_bboxes_ignore_list=gt_bboxes_ignore, gt_labels_list=gt_labels, num_classes=self.num_classes, sampling=self.sampling) if cls_reg_targets is None: return None (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, dir_targets_list, dir_weights_list, num_total_pos, num_total_neg) = cls_reg_targets num_total_samples = ( num_total_pos + num_total_neg if self.sampling else num_total_pos) # num_total_samples = None losses_cls, losses_bbox, losses_dir = multi_apply( self.loss_single, cls_scores, bbox_preds, dir_cls_preds, labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, dir_targets_list, dir_weights_list, num_total_samples=num_total_samples) return dict( loss_cls=losses_cls, loss_bbox=losses_bbox, loss_dir=losses_dir) def get_bboxes(self, cls_scores, bbox_preds, dir_cls_preds, input_metas, cfg=None, 
rescale=False): """Get bboxes of anchor head. Args: cls_scores (list[torch.Tensor]): Multi-level class scores. bbox_preds (list[torch.Tensor]): Multi-level bbox predictions. dir_cls_preds (list[torch.Tensor]): Multi-level direction class predictions. input_metas (list[dict]): Contain pcd and img's meta info. cfg (None | :obj:`ConfigDict`): Training or testing config. Default: None. rescale (list[torch.Tensor], optional): Whether to rescale bbox. Default: False. Returns: list[tuple]: Prediction resultes of batches. """ assert len(cls_scores) == len(bbox_preds) assert len(cls_scores) == len(dir_cls_preds) num_levels = len(cls_scores) assert num_levels == 1, 'Only support single level inference.' device = cls_scores[0].device mlvl_anchors = self.anchor_generator.grid_anchors( self.featmap_sizes, device=device) # `anchor` is a list of anchors for different classes mlvl_anchors = [torch.cat(anchor, dim=0) for anchor in mlvl_anchors] result_list = [] for img_id in range(len(input_metas)): cls_score_list = [ cls_scores[i][img_id].detach() for i in range(num_levels) ] bbox_pred_list = [ bbox_preds[i][img_id].detach() for i in range(num_levels) ] dir_cls_pred_list = [ dir_cls_preds[i][img_id].detach() for i in range(num_levels) ] input_meta = input_metas[img_id] proposals = self.get_bboxes_single(cls_score_list, bbox_pred_list, dir_cls_pred_list, mlvl_anchors, input_meta, cfg, rescale) result_list.append(proposals) return result_list def get_bboxes_single(self, cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors, input_meta, cfg=None, rescale=False): """Get bboxes of single branch. Args: cls_scores (torch.Tensor): Class score in single batch. bbox_preds (torch.Tensor): Bbox prediction in single batch. dir_cls_preds (torch.Tensor): Predictions of direction class in single batch. mlvl_anchors (List[torch.Tensor]): Multi-level anchors in single batch. input_meta (list[dict]): Contain pcd and img's meta info. cfg (None | :obj:`ConfigDict`): Training or testing config. 
rescale (list[torch.Tensor], optional): whether to rescale bbox. \ Default: False. Returns: tuple: Contain predictions of single batch. - bboxes (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes. - scores (torch.Tensor): Class score of each bbox. - labels (torch.Tensor): Label of each bbox. """ cfg = self.test_cfg if cfg is None else cfg assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) mlvl_bboxes = [] mlvl_scores = [] mlvl_dir_scores = [] for cls_score, bbox_pred, dir_cls_pred, anchors in zip( cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors): assert cls_score.size()[-2] == bbox_pred.size()[-2] assert cls_score.size()[-2] == dir_cls_pred.size()[-2] dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] if self.use_sigmoid_cls: scores = cls_score.sigmoid() else: scores = cls_score.softmax(-1) nms_pre = cfg.get('nms_pre', -1) if nms_pre > 0 and scores.shape[0] > nms_pre: if self.use_sigmoid_cls: max_scores, _ = scores.max(dim=1) else: max_scores, _ = scores[:, :-1].max(dim=1) _, topk_inds = max_scores.topk(nms_pre) anchors = anchors[topk_inds, :] bbox_pred = bbox_pred[topk_inds, :] scores = scores[topk_inds, :] dir_cls_score = dir_cls_score[topk_inds] bboxes = self.bbox_coder.decode(anchors, bbox_pred) mlvl_bboxes.append(bboxes) mlvl_scores.append(scores) mlvl_dir_scores.append(dir_cls_score) mlvl_bboxes = torch.cat(mlvl_bboxes) mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( mlvl_bboxes, box_dim=self.box_code_size).bev) mlvl_scores = torch.cat(mlvl_scores) mlvl_dir_scores = torch.cat(mlvl_dir_scores) if self.use_sigmoid_cls: # Add a dummy background class to the front when using sigmoid padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) score_thr = cfg.get('score_thr', 0) results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms, mlvl_scores, score_thr, cfg.max_num, cfg, mlvl_dir_scores) bboxes, scores, labels, dir_scores = results if bboxes.shape[0] > 0: dir_rot = 
def __init__(self,
             num_views=0,
             in_channels_img=64,
             out_size_factor_img=4,
             num_proposals=128,
             num_img_proposals=128,
             in_channels=128 * 3,
             hidden_channel=128,
             num_classes=4,
             # config for Transformer
             num_pts_decoder_layers=1,
             num_img_decoder_layers=1,
             num_fusion_decoder_layers=1,
             num_heads=8,
             initialize_by_heatmap=True,
             semantic_transfer=True,
             cross_only=True,
             range_num=5,
             cross_heatmap_layer=1,
             img_heatmap_layer=2,
             img_reg_layer=3,
             nms_kernel_size=3,
             img_nms_kernel_size=3,
             ffn_channel=256,
             dropout=0.1,
             bn_momentum=0.1,
             activation='relu',
             # config for FFN
             common_heads=dict(),
             conv_cfg=dict(type='Conv1d'),
             norm_cfg=dict(type='BN1d'),
             bias='auto',
             # loss
             loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
             loss_bbox=dict(type='L1Loss', reduction='mean'),
             loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean'),
             loss_heatmap_2d=dict(type='GaussianFocalLoss', reduction='mean'),
             loss_center_2d=dict(type='L1Loss', reduction='mean'),
             # others
             train_cfg=None,
             test_cfg=None,
             bbox_coder=None,
             bbox_2d_coder=None,
             use_camera='se',
             level_num=4,
             img_reg_bn=False,
             geometric_transfer=True,
             view_transform=True,
             depth_input_channel=2,
             ):
    """Build the point, image, view and fusion branches of the head.

    Args:
        num_views (int): Number of camera views per sample.
        in_channels_img (int): Channels of the incoming image features.
        out_size_factor_img (int): Down-sampling factor of the image
            features, stored and forwarded to the image transformer.
        num_proposals (int): Number of point-branch queries.
        num_img_proposals (int): Number of image-branch queries.
        in_channels (int): Input channels of the shared LiDAR convolution.
        hidden_channel (int): Feature width used by all branches.
        num_classes (int): Number of classes; ``self.num_classes`` is
            incremented by one when ``loss_cls`` has no truthy
            ``use_sigmoid`` entry.
        num_pts_decoder_layers / num_img_decoder_layers /
        num_fusion_decoder_layers (int): Decoder depth of each branch.
        num_heads (int): Attention heads.
        initialize_by_heatmap (bool): Build heatmap heads and class
            encodings; otherwise learnable query features/positions are
            created.
        semantic_transfer (bool): With ``initialize_by_heatmap``, also build
            the cross-modal heatmap modules.
        cross_heatmap_layer / img_heatmap_layer / img_reg_layer (int):
            Layer counts of the respective heads.
        common_heads (dict): Extra regression heads of the point branch.
        conv_cfg, norm_cfg, bias: Forwarded to the FFN heads.
        loss_*: Loss configs passed to ``build_loss``.
        train_cfg, test_cfg: Runtime configs; ``test_cfg`` must provide
            ``grid_size`` and ``out_size_factor``.
        bbox_coder, bbox_2d_coder: Configs passed to ``build_bbox_coder``.
        use_camera: Forwarded to ``ViewTransformer``.
        level_num (int): Number of image feature levels.
        img_reg_bn (bool): Third entry of the 2D regression head specs.
        geometric_transfer (bool): Use ``DepthEncoderResNet`` on the image
            features instead of a plain shared image convolution.
        view_transform (bool): Build the view transformer.
        depth_input_channel (int): Input channels of the depth encoder.
        cross_only, range_num, nms_kernel_size, img_nms_kernel_size,
        bn_momentum: Stored on ``self`` for use by other methods.
    """
    super(SparseFusionHead2D_Deform, self).__init__()

    self.num_proposals = num_proposals
    self.num_img_proposals = num_img_proposals
    self.num_classes = num_classes
    self.bbox_coder = build_bbox_coder(bbox_coder)
    self.bbox_2d_coder = build_bbox_coder(bbox_2d_coder)

    self.bn_momentum = bn_momentum
    self.train_cfg = train_cfg
    self.test_cfg = test_cfg
    self.initialize_by_heatmap = initialize_by_heatmap
    self.semantic_transfer = semantic_transfer
    self.cross_only = cross_only
    self.level_num = level_num
    self.in_channels_img = in_channels_img
    self.view_transform = view_transform
    self.range_num = range_num

    self.loss_cls = build_loss(loss_cls)
    self.loss_bbox = build_loss(loss_bbox)
    self.loss_heatmap = build_loss(loss_heatmap)
    self.loss_heatmap_2d = build_loss(loss_heatmap_2d)
    self.loss_center_2d = build_loss(loss_center_2d)

    self.num_img_decoder_layers = num_img_decoder_layers
    self.num_pts_decoder_layers = num_pts_decoder_layers
    self.num_fusion_decoder_layers = num_fusion_decoder_layers
    self.hidden_channel = hidden_channel
    self.sampling = False
    self.out_size_factor_img = out_size_factor_img
    self.geometric_transfer = geometric_transfer

    self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
    if not self.use_sigmoid_cls:
        # NOTE(review): the heatmap heads and class encodings below are
        # built with the un-incremented ``num_classes`` - confirm that the
        # non-sigmoid configuration is actually supported.
        self.num_classes += 1

    heads3d = copy.deepcopy(common_heads)
    heads3d.update(dict(heatmap=(self.num_classes, 2)))
    pts_prediction_heads = FFN(hidden_channel, heads3d, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=bias)

    fusion_heads = dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2), heatmap=(self.num_classes, 2))
    fusion_prediction_heads = FFN(hidden_channel, fusion_heads, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=bias)

    heads2d = dict(center_2d=(2, img_reg_layer, img_reg_bn), depth_2d=(1, img_reg_layer, img_reg_bn), cls=(self.num_classes, 2),
                   dim_2d=(3, img_reg_layer, img_reg_bn), rot_2d=(2, img_reg_layer, img_reg_bn), vel_2d=(2, img_reg_layer, img_reg_bn)
                   )
    # img_prediction_heads = FFN(hidden_channel, heads2d, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=bias)
    img_prediction_heads = FFNLN(hidden_channel, heads2d, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=bias)

    # transformer decoder for the point (LiDAR) queries
    pts_query_pos_embed = [PositionEmbeddingLearned(2, hidden_channel) for _ in range(num_pts_decoder_layers)]
    pts_key_pos_embed = [PositionEmbeddingLearned(2, hidden_channel) for _ in range(num_pts_decoder_layers)]
    self.point_transformer = PointTransformer2D_3D(
        hidden_channel=hidden_channel, num_heads=num_heads, num_decoder_layers=num_pts_decoder_layers,
        prediction_heads=pts_prediction_heads, ffn_channel=ffn_channel, dropout=dropout, activation=activation, test_cfg=test_cfg,
        query_pos=pts_query_pos_embed, key_pos=pts_key_pos_embed
    )

    # transformer decoder for the image queries
    img_query_pos_embed = [PositionEmbeddingLearnedwoNorm(2, hidden_channel) for _ in range(num_img_decoder_layers)]
    img_key_pos_embed = [PositionEmbeddingLearnedwoNorm(2, hidden_channel) for _ in range(num_img_decoder_layers)]
    self.img_transformer = ImageTransformer_Cam_3D_MS(
        hidden_channel=hidden_channel, num_heads=num_heads, num_decoder_layers=num_img_decoder_layers, out_size_factor_img=out_size_factor_img,
        num_views=num_views, prediction_heads=img_prediction_heads, ffn_channel=ffn_channel, dropout=dropout, activation=activation, test_cfg=test_cfg,
        query_pos=img_query_pos_embed, key_pos=img_key_pos_embed
    )

    if view_transform:
        heads_view = dict(center_view=(2, 2), height_view=(1, 2), dim_view=(3, 2), rot_view=(2, 2), vel_view=(2, 2), heatmap_view=(self.num_classes, 2))
        view_prediction_heads = FFN(hidden_channel, heads_view, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=bias)
        # view_prediction_heads = FFNLN(hidden_channel, heads_view, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=bias)
        view_query_pos_embed = PositionEmbeddingLearnedwoNorm(9, hidden_channel)
        view_key_pos_embed = PositionEmbeddingLearnedwoNorm(9, hidden_channel)
        view_projection = ProjectionLayerNorm(hidden_channel)
        self.view_transformer = ViewTransformer(
            hidden_channel=hidden_channel, num_heads=num_heads, prediction_heads=view_prediction_heads,
            ffn_channel=ffn_channel, dropout=dropout, activation=activation, test_cfg=test_cfg,
            query_pos=view_query_pos_embed, key_pos=view_key_pos_embed, view_projection=view_projection,
            use_camera=use_camera
        )

    fusion_query_pos_embed = [PositionEmbeddingLearned(2, hidden_channel) for _ in range(self.num_fusion_decoder_layers)]
    # NOTE(review): ``fusion_key_pos_embed`` is created but the transformer
    # below receives ``fusion_query_pos_embed`` for ``key_pos`` as well.
    # Left unchanged to keep module construction identical - confirm intent.
    fusion_key_pos_embed = [PositionEmbeddingLearned(2, hidden_channel) for _ in range(self.num_fusion_decoder_layers)]
    fuse_pts_projection = ProjectionLayerNorm(hidden_channel)
    fuse_img_projection = ProjectionLayerNorm(hidden_channel)
    self.fusion_transformer = FusionTransformer2D_3D_Self(
        hidden_channel=hidden_channel, num_heads=num_heads, num_decoder_layers=num_fusion_decoder_layers,
        prediction_heads=fusion_prediction_heads, ffn_channel=ffn_channel, dropout=dropout, activation=activation, test_cfg=test_cfg,
        query_pos=fusion_query_pos_embed, key_pos=fusion_query_pos_embed,
        pts_projection=fuse_pts_projection, img_projection=fuse_img_projection,
        num_proposals=num_proposals
    )

    if self.initialize_by_heatmap and self.semantic_transfer:
        self.heatmap_pts_proj = nn.Sequential(
            nn.Linear(hidden_channel, hidden_channel),
            nn.LayerNorm(hidden_channel)
        )
        self.heatmap_img_proj = nn.Sequential(
            nn.Linear(hidden_channel, hidden_channel),
            nn.LayerNorm(hidden_channel)
        )
        self.cross_heatmap_head = self.build_heatmap_LN(hidden_channel, bias, num_classes, layer_num=cross_heatmap_layer)
        colattn_query_pos = PositionEmbeddingLearnedwoNorm(3, hidden_channel)
        colattn_key_pos = PositionEmbeddingLearnedwoNorm(2, hidden_channel)
        self.cross_heatmap_decoder = DeformableTransformerDecoderLayer(
            hidden_channel, num_heads, dim_feedforward=ffn_channel, dropout=dropout, activation=activation,
            self_posembed=colattn_query_pos, cross_posembed=colattn_key_pos, cross_only=False
        )
        self.reduce_conv = ConvLN(
            hidden_channel+1, hidden_channel, kernel_size=3, padding=1
        )

    # a shared convolution
    self.shared_conv = build_conv_layer(
        dict(type='Conv2d'),
        in_channels,
        hidden_channel,
        kernel_size=3,
        padding=1,
        bias=bias,
    )

    # transformer decoder layers for object query with LiDAR feature
    self.num_views = num_views
    if self.geometric_transfer:
        self.shared_conv_img = nn.Identity()
        blocks = [1] * self.level_num
        assert len(blocks) == self.level_num
        self.depth_resnet = DepthEncoderResNet(depth_input_channel, in_channels_img, hidden_channel, depth_layers=blocks)
    else:
        self.shared_conv_img = build_conv_layer(
            dict(type='Conv2d'),
            in_channels_img,  # channel of img feature map
            hidden_channel,
            kernel_size=3,
            padding=1,
            bias=bias,
        )

    # Position Embedding for Cross-Attention, which is re-used during training
    x_size = self.test_cfg['grid_size'][0] // self.test_cfg['out_size_factor']
    y_size = self.test_cfg['grid_size'][1] // self.test_cfg['out_size_factor']
    self.bev_pos = self.create_2D_grid(x_size, y_size)

    if self.initialize_by_heatmap:
        self.heatmap_head = self.build_heatmap(hidden_channel, bias, num_classes)
        self.img_heatmap_head = nn.ModuleList()
        for lvl in range(self.level_num):
            self.img_heatmap_head.append(self.build_heatmap_LN(hidden_channel, bias, num_classes, layer_num=img_heatmap_layer))
        self.class_encoding = nn.Conv1d(num_classes, hidden_channel, 1)
        self.img_class_encoding = nn.Conv1d(num_classes, hidden_channel, 1)
    else:
        # query feature
        self.pts_query_feat = nn.Parameter(torch.randn(1, hidden_channel, self.num_proposals))
        self.pts_query_pos = nn.Parameter(torch.rand([1, self.num_proposals, 2])*torch.Tensor([x_size, y_size]).reshape(1, 1, 2), requires_grad=True)

        self.img_query_feat = nn.Parameter(torch.randn(1, hidden_channel, self.num_img_proposals))
        # Apply inverse_sigmoid *before* wrapping in nn.Parameter: assigning
        # a plain tensor to an attribute that is already registered as a
        # parameter makes nn.Module.__setattr__ raise a TypeError.
        self.img_query_pos = nn.Parameter(
            inverse_sigmoid(torch.rand([1, self.num_img_proposals, 2])), requires_grad=True)

    self.nms_kernel_size = nms_kernel_size
    self.img_nms_kernel_size = img_nms_kernel_size
    self.img_feat_pos = None
    self.img_feat_collapsed_pos = None

    self.init_weights()
    self._init_assigner_sampler()
def create_2D_grid(self, x_size, y_size):
    """Return the cell-centre coordinates of an ``x_size`` x ``y_size`` grid.

    The result has shape (1, x_size * y_size, 2). Channel 0 runs over the
    second axis (0.5 ... y_size - 0.5, fastest varying), channel 1 over the
    first axis (0.5 ... x_size - 0.5).
    """
    first_axis = torch.linspace(0, x_size - 1, x_size)
    second_axis = torch.linspace(0, y_size - 1, y_size)
    along_first, along_second = torch.meshgrid(first_axis, second_axis)
    # shift by half a cell so coordinates refer to cell centres
    centres = torch.stack([along_second + 0.5, along_first + 0.5], dim=0)[None]
    return centres.view(1, 2, -1).permute(0, 2, 1)


def init_bn_momentum(self):
    """Set ``self.bn_momentum`` on every 1D/2D batch-norm submodule."""
    batch_norm_types = (nn.BatchNorm2d, nn.BatchNorm1d)
    for module in self.modules():
        if not isinstance(module, batch_norm_types):
            continue
        module.momentum = self.bn_momentum


def init_weights(self):
    """Initialise the head's parameters and create ``self.level_pos``.

    Every parameter with more than one dimension gets Xavier-uniform
    initialisation, ``MSDeformAttn`` modules are then reset with their own
    routine, batch-norm momentum is applied, and finally the per-level
    embedding ``level_pos`` is created and drawn from a normal
    distribution.
    """
    for param in self.parameters():
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)
    for module in self.modules():
        if isinstance(module, MSDeformAttn):
            module._reset_parameters()
    self.init_bn_momentum()

    # the embedding is added to the image features before shared_conv_img,
    # so its width follows the feature width of the active configuration
    embed_width = self.hidden_channel if self.geometric_transfer else self.in_channels_img
    self.level_pos = nn.Parameter(torch.zeros([self.level_num, embed_width]), requires_grad=True)
    torch.nn.init.normal_(self.level_pos)


def _init_assigner_sampler(self):
    """Initialize the target assigner and sampler of the head.

    Does nothing when no training config is present. ``bbox_assigner`` and
    ``bbox_assigner_2d`` are each built as a single assigner (dict config)
    or a list of assigners (list config).
    """
    if self.train_cfg is None:
        return

    if self.sampling:
        self.bbox_sampler = build_sampler(self.train_cfg.sampler)
    else:
        self.bbox_sampler = PseudoSampler()

    for cfg_name in ('assigner', 'assigner_2d'):
        assigner_cfg = getattr(self.train_cfg, cfg_name)
        if isinstance(assigner_cfg, dict):
            setattr(self, 'bbox_' + cfg_name, build_assigner(assigner_cfg))
        elif isinstance(assigner_cfg, list):
            setattr(self, 'bbox_' + cfg_name, [build_assigner(res) for res in assigner_cfg])
def forward_single(self, inputs, img_inputs, img_metas, sparse_depth):
    """Run the point, image, view and fusion branches on one feature level.

    Args:
        inputs (torch.Tensor): LiDAR BEV feature map [B, C + 2, H, W]; the
            last two channels are split off as the per-cell minimum and
            maximum voxel height.
        img_inputs: Per-level image features (indexed by level), each with
            the shape of [B*num_view, C, image_H, image_W].
        img_metas (list[dict]): Meta information of each sample.
        sparse_depth (torch.Tensor): Input normalized depth with the shape
            of [B, num_views, num_scales, depth_C, depth_H, depth_W]; only
            scale 0 and the first two depth channels are used.

    Returns:
        list[dict]: A single-element list whose dict concatenates the
            predictions of all decoder layers along the last dimension.
    """
    batch_size = inputs.shape[0]

    # keep only the first scale and the first two depth channels
    sparse_depth = sparse_depth[:, :, 0, :2]

    if self.geometric_transfer:
        sparse_depth = sparse_depth.view(batch_size*self.num_views, 1, -1, sparse_depth.shape[-2], sparse_depth.shape[-1])
        img_inputs = self.depth_resnet(sparse_depth[:, 0], img_inputs)

    # add the learned per-level embedding to every image feature level
    img_feats = []
    for i in range(self.level_num):
        img_inputs_level = img_inputs[i] + self.level_pos[i].reshape(1, self.level_pos[i].shape[0], 1, 1)
        img_feat = self.shared_conv_img(img_inputs_level)
        img_feats.append(img_feat)

    input_padding_mask = self.construct_input_padding_mask(img_feats, img_metas)
    # input_padding_mask = None

    # pixel-grid positions of every level, raw and normalised
    img_feats_pos = []
    normal_img_feats_pos = []
    for lvl in range(self.level_num):
        h, w = img_feats[lvl].shape[-2], img_feats[lvl].shape[-1]
        img_feat_pos = self.create_2D_grid(h, w).to(img_feats[lvl].device)  # (1, h*w, 2)
        img_feats_pos.append(img_feat_pos)
        normal_img_feat_pos = normalize_pos(img_feat_pos, w, h)  # (1, h*w, 2)
        normal_img_feats_pos.append(normal_img_feat_pos)

    normal_img_feats_pos_stack = torch.cat(normal_img_feats_pos, dim=1)  # (1, h*w (sum), 2)
    # cached on self; generate_heatmap_deform reads this attribute
    self.normal_img_feats_pos_stack = normal_img_feats_pos_stack

    normal_img_feats_pos_repeat = normal_img_feats_pos_stack.repeat(batch_size, 1, 1)
    proj_matrix = self.construct_projection_matrix(img_metas, normal_img_feats_pos_stack.device)

    # the last two input channels carry the min / max voxel height
    inputs, min_voxel_height, max_voxel_height = inputs[:, :-2], inputs[:, -2], inputs[:, -1]
    lidar_feat = self.shared_conv(inputs)  # [BS, C, H, W]

    #################################
    # image to BEV
    #################################
    lidar_feat_flatten = lidar_feat.view(batch_size, lidar_feat.shape[1], -1)  # [BS, C, H*W]
    bev_pos = self.bev_pos.repeat(batch_size, 1, 1).to(lidar_feat.device)  # [BS, H*W, 2]

    if self.initialize_by_heatmap:
        if self.semantic_transfer:
            # clones are passed on; generate_heatmap may modify its inputs
            img_feat_cross = []
            for level in range(self.level_num):
                img_feat_cross.append(img_feats[level].clone())
        else:
            img_feat_cross = None

        heatmap, dense_heatmap, pts_top_proposals_class, pts_top_proposals_index = self.generate_heatmap(lidar_feat.clone(), min_voxel_height, max_voxel_height, batch_size, img_metas, proj_matrix['lidar2img_rt'], img_feat_cross, input_padding_mask)
        # pick the BEV features at the selected heatmap locations
        pts_query_feat = lidar_feat_flatten.gather(
            index=pts_top_proposals_index[:, None, :].expand(-1, lidar_feat_flatten.shape[1], -1), dim=-1
        )  # [BS, C, num_proposals]

        # add category embedding
        one_hot = F.one_hot(pts_top_proposals_class, num_classes=self.num_classes).permute(0, 2, 1)  # [BS, num_classes, num_proposals]

        query_cat_encoding = self.class_encoding(one_hot.float())  # [BS, C, num_proposals]
        self.query_labels = pts_top_proposals_class
        pts_query_feat += query_cat_encoding
        pts_query_pos = bev_pos.gather(
            index=pts_top_proposals_index[:, None, :].permute(0, 2, 1).expand(-1, -1, bev_pos.shape[-1]), dim=1
        )  # [BS, num_proposals, 2]
    else:
        # learned queries shared by all samples
        pts_query_feat = self.pts_query_feat.repeat(batch_size, 1, 1)  # [BS, C, num_proposals]
        pts_query_pos = self.pts_query_pos.repeat(batch_size, 1, 1).to(lidar_feat.device)  # [BS, num_proposals, 2]

    if self.initialize_by_heatmap:
        img_feats_heatmap = []
        for lvl in range(self.level_num):
            img_feats_heatmap.append(img_feats[lvl].clone())
        img_heatmap, img_dense_heatmap, img_top_proposals_class, img_top_proposals_index, img_top_proposals_view_idx, img_top_proposals_pos_id = \
            self.generate_heatmap_img(img_feats_heatmap, batch_size)

        # flatten all levels and views into one index space for gather
        img_feats_flatten = []
        for lvl in range(self.level_num):
            img_feat = img_feats[lvl]
            h, w = img_feat.shape[-2], img_feat.shape[-1]
            img_feat_flatten = img_feat.reshape(batch_size, self.num_views, self.hidden_channel, h * w)
            img_feat_flatten = img_feat_flatten.permute(0, 2, 1, 3)  # [BS, C, num_view, h*w]
            img_feats_flatten.append(img_feat_flatten)

        img_feat_stack = torch.cat(img_feats_flatten, dim=-1)  # [BS, C, num_view, h*w (sum)]
        img_feat_stack = img_feat_stack.view(batch_size, self.hidden_channel, self.num_views*img_feat_stack.shape[-1])
        normal_img_query_pos = normal_img_feats_pos_repeat.gather(
            index=img_top_proposals_pos_id[:, None, :].permute(0, 2, 1).expand(-1, -1, normal_img_feats_pos_stack.shape[-1]), dim=1
        )  # [BS, num_proposals, 2]

        img_query_feat = img_feat_stack.gather(
            index=img_top_proposals_index[:, None, :].expand(-1, img_feat_stack.shape[1], -1), dim=-1
        )  # [BS, C, num_proposals]
        img_query_view = img_top_proposals_view_idx.clone()  # [BS, num_proposals]

        one_hot = F.one_hot(img_top_proposals_class, num_classes=self.num_classes).permute(0, 2, 1)  # [BS, num_classes, num_proposals]
        self.img_query_label = img_top_proposals_class
        img_query_cat_encoding = self.img_class_encoding(one_hot.float())  # [BS, C, num_proposals]
        img_query_feat += img_query_cat_encoding
    else:
        img_query_feat = self.img_query_feat.repeat(batch_size, 1, 1)  # [BS, C, num_proposals]
        # ``img_feat`` here is the last level produced by the loop above
        normal_img_query_pos = self.img_query_pos.repeat(batch_size, 1, 1).to(img_feat.device)  # [BS, num_proposals, 2]
        # learned queries are assigned to views round-robin
        img_query_pos_view = torch.arange(self.num_img_proposals).reshape(1, -1).repeat(batch_size, 1).to(img_feat.device)
        img_query_view = img_query_pos_view % self.num_views

    view_proj_matrix = self.construction_view_projection_matrix(proj_matrix, img_query_view)

    #################################
    # transformer decoder layer (LiDAR feature as K,V)
    #################################
    ret_dicts = []
    pts_query_feat, pts_query_pos, pts_ret_dicts = self.point_transformer(pts_query_feat, pts_query_pos, lidar_feat_flatten, bev_pos)
    ret_dicts.extend(pts_ret_dicts)

    #################################
    # transformer decoder layer (img feature as K,V)
    #################################
    img_query_feat, normal_img_query_pos, img_query_pos_bev, camera_info, img_ret_dicts = \
        self.img_transformer(img_query_feat, normal_img_query_pos, img_query_view, img_feats, normal_img_feats_pos_stack, view_proj_matrix['lidar2cam_rt'], view_proj_matrix['cam_intrinsic'], img_metas, input_padding_mask)

    #################################
    # view transformation layer
    #################################
    if self.view_transform:
        img_query_feat, img_query_pos_bev, view_ret_dicts = self.view_transformer(img_query_feat, img_query_pos_bev, normal_img_query_pos[..., :2], img_ret_dicts, camera_info)

    # only the first two position components are passed to the fusion layer
    img_query_pos_bev = img_query_pos_bev[..., :2]

    #################################
    # fusion layer
    #################################
    all_query_feat, all_query_pos, fusion_ret_dicts = self.fusion_transformer(pts_query_feat, pts_query_pos, img_query_feat, img_query_pos_bev)
    ret_dicts.extend(fusion_ret_dicts)

    if self.initialize_by_heatmap:
        ret_dicts[0]['query_heatmap_score'] = heatmap.gather(index=pts_top_proposals_index[:, None, :].expand(-1, self.num_classes, -1), dim=-1)  # [bs, num_classes, num_proposals]
        ret_dicts[0]['dense_heatmap'] = dense_heatmap

        ret_dicts[0]['img_query_heatmap_score'] = img_heatmap.gather(index=img_top_proposals_index[:, None, :].expand(-1, self.num_classes, -1), dim=-1)  # [bs, num_classes, num_proposals]
        ret_dicts[0]['img_dense_heatmap'] = img_dense_heatmap

    # return all the layer's results for auxiliary supervision
    new_res = {}
    for key in ret_dicts[0].keys():
        # heatmap entries exist only once (on the first dict); everything
        # else is concatenated across decoder layers
        if key not in ['dense_heatmap', 'query_heatmap_score', 'img_query_heatmap_score', 'img_dense_heatmap']:
            new_res[key] = torch.cat([ret_dict[key] for ret_dict in ret_dicts], dim=-1)
        else:
            new_res[key] = ret_dicts[0][key]
    for key in img_ret_dicts[0].keys():
        new_res[key] = torch.cat([ret_dict[key] for ret_dict in img_ret_dicts], dim=-1)
    # repeat the view ids once per image decoder layer to match the
    # concatenated image predictions
    new_res['view'] = img_query_view.repeat(1, self.num_img_decoder_layers)
    if self.view_transform:
        for key in view_ret_dicts[0].keys():
            new_res[key] = torch.cat([ret_dict[key] for ret_dict in view_ret_dicts], dim=-1)

    return [new_res]
def forward(self, feats, img_feats, img_metas, sparse_depth=None):
    """Forward pass.

    Image features and sparse depth are truncated to the first
    ``level_num`` levels and wrapped in single-element lists so that
    ``multi_apply`` invokes :meth:`forward_single` exactly once.

    Args:
        feats (list[torch.Tensor]): LiDAR features; exactly one level is
            supported.
        img_feats (list[torch.Tensor] | None): Multi-level image features.
        img_metas (list[dict]): Meta information of each sample.
        sparse_depth (torch.Tensor | None): Sparse depth input; its third
            dimension is the level axis.

    Returns:
        tuple(list[dict]): Output results. first index by level, second
            index by layer
    """
    img_levels = [None] if img_feats is None else [img_feats[:self.level_num]]
    depth_levels = [None] if sparse_depth is None else [sparse_depth[:, :, :self.level_num]]

    res = multi_apply(self.forward_single, feats, img_levels, [img_metas], depth_levels)
    assert len(res) == 1, "only support one level features."
    return res
def construct_input_padding_mask(self, img_feats, img_metas):
    """Build the padding mask over all flattened image feature levels.

    Returns a bool tensor [batch, num_views, sum(h*w)] that is True where a
    feature cell lies outside the valid image area. The valid (w, h) comes
    from ``valid_shape`` (per view) when present in the meta, otherwise
    from ``img_shape``; it is divided by ``out_size_factor_img`` and then
    floor-divided by two for each further level.
    """
    device = img_feats[0].device
    level_hw = [(feat.shape[-2], feat.shape[-1]) for feat in img_feats]

    batch_masks = []
    for meta in img_metas:
        view_masks = []
        for view_idx in range(self.num_views):
            if 'valid_shape' in meta:
                valid_wh = meta['valid_shape'][view_idx] / self.out_size_factor_img
            else:
                valid_wh = np.array([meta['img_shape'][1], meta['img_shape'][0]]) / self.out_size_factor_img

            flat_levels = []
            for lvl_idx in range(self.level_num):
                feat_h, feat_w = level_hw[lvl_idx]
                mask = torch.ones([feat_h, feat_w], dtype=torch.bool, device=device)
                scaled_wh = valid_wh // (2 ** lvl_idx)
                valid_w = int(scaled_wh[0])
                valid_h = int(scaled_wh[1])
                mask[:valid_h, :valid_w] = False
                flat_levels.append(mask.view(-1))
            view_masks.append(torch.cat(flat_levels))
        batch_masks.append(torch.stack(view_masks, dim=0))
    return torch.stack(batch_masks, dim=0)


def construction_view_projection_matrix(self, proj_matrix, img_query_view):
    """Select, for every image query, the matrices of the view it belongs to.

    Each tensor in ``proj_matrix`` is indexed with (sample id, view id), so
    the leading [batch, num_views] dimensions become
    [batch, num_img_proposals].
    """
    num_samples = img_query_view.shape[0]
    sample_ids = torch.arange(num_samples)[:, None].repeat(1, self.num_img_proposals)
    sample_ids = sample_ids.to(img_query_view.device)
    return {name: mats[sample_ids, img_query_view] for name, mats in proj_matrix.items()}


def construct_projection_matrix(self, img_metas, device):
    """Assemble homogeneous 4x4 camera matrices from the sample metas.

    Returns a dict with ``cam_intrinsic``, ``lidar2cam_rt`` and their
    product ``lidar2img_rt``, each of shape [batch, num_views, 4, 4].
    """
    num_samples = len(img_metas)

    def _homogeneous():
        mats = torch.zeros([num_samples, self.num_views, 4, 4], device=device)
        mats[:, :, 3, 3] = 1
        return mats

    cam_ints = _homogeneous()
    lidar2cam_rt = _homogeneous()
    for sample_id, meta in enumerate(img_metas):
        cam_ints[sample_id, :, :3, :3] = torch.Tensor(meta['cam_intrinsic']).to(device)
        lidar2cam_rt[sample_id, :, :3, :3] = torch.Tensor(meta['lidar2cam_r']).to(device)
        lidar2cam_rt[sample_id, :, :3, 3] = torch.Tensor(meta['lidar2cam_t']).to(device)

    lidar2img_rt = torch.matmul(cam_ints, lidar2cam_rt)
    return {
        "cam_intrinsic": cam_ints,
        "lidar2cam_rt": lidar2cam_rt,
        "lidar2img_rt": lidar2img_rt
    }
lidar2cam_rt[:, :, 3, 3] = 1 for sample_id in range(batch_size): lidar2cam_rt[sample_id, :, :3, :3] = torch.Tensor(img_metas[sample_id]['lidar2cam_r']).to(device) lidar2cam_rt[sample_id, :, :3, 3] = torch.Tensor(img_metas[sample_id]['lidar2cam_t']).to(device) lidar2img_rt = torch.matmul(cam_ints, lidar2cam_rt) proj_matrix = {"cam_intrinsic": cam_ints, "lidar2cam_rt": lidar2cam_rt, "lidar2img_rt": lidar2img_rt} return proj_matrix def build_heatmap_LN(self, hidden_channel, bias, num_classes, layer_num=2, kernel_size=3): layers = [] for i in range(layer_num-1): layers.append(ConvLN( hidden_channel, hidden_channel, kernel_size=kernel_size, padding=(kernel_size-1)//2, )) layers.append(build_conv_layer( dict(type='Conv2d'), hidden_channel, num_classes, kernel_size=kernel_size, padding=(kernel_size-1)//2, bias=bias, )) return nn.Sequential(*layers) def build_heatmap(self, hidden_channel, bias, num_classes, layer_num=2, kernel_size=3): layers = [] for i in range(layer_num-1): layers.append(ConvModule( hidden_channel, hidden_channel, kernel_size=kernel_size, padding=(kernel_size-1)//2, bias=bias, conv_cfg=dict(type='Conv2d'), norm_cfg=dict(type='BN2d'), )) layers.append(build_conv_layer( dict(type='Conv2d'), hidden_channel, num_classes, kernel_size=kernel_size, padding=(kernel_size-1)//2, bias=bias, )) return nn.Sequential(*layers) def generate_heatmap_deform(self, lidar_feat, img_feat, voxel_height, img_metas, lidar2img_rt, input_padding_mask=None): # img_feat [bs*num_view, C, img_h, img_w] # lidar_feat [BS, C, H, W] batch_size = lidar_feat.shape[0] H, W = lidar_feat.shape[2], lidar_feat.shape[3] voxel_height = voxel_height.view(batch_size, H*W) valid_height_mask = voxel_height > -50 level_start_index = [0] spatial_shapes = [] img_feats_flatten = [] for lvl in range(self.level_num): img_h_lvl, img_w_lvl = img_feat[lvl].shape[-2], img_feat[lvl].shape[-1] img_feat[lvl] = self.heatmap_img_proj(img_feat[lvl].permute(0, 2, 3, 1)).permute(0, 3, 1, 2) # img_feat[lvl] = 
self.heatmap_img_proj(img_feat[lvl]) img_feat[lvl] = img_feat[lvl].view(batch_size, self.num_views, self.hidden_channel, img_h_lvl, img_w_lvl) img_feat_flatten = img_feat[lvl].view(batch_size, self.num_views, self.hidden_channel, img_h_lvl*img_w_lvl) img_feats_flatten.append(img_feat_flatten) level_start_index.append(level_start_index[-1] + img_h_lvl * img_w_lvl) spatial_shapes.append([img_h_lvl, img_w_lvl]) level_start_index = level_start_index[:-1] level_start_index = torch.LongTensor(level_start_index).to(lidar_feat.device) spatial_shapes = torch.LongTensor(spatial_shapes).to(lidar_feat.device) img_feats_stack = torch.cat(img_feats_flatten, dim=3) # [bs, num_view, C, h*w (sum)] normal_img_feats_pos_stack = self.normal_img_feats_pos_stack # [1, h*w (sum), 2] lidar_feat = self.heatmap_pts_proj(lidar_feat.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) # lidar_feat = self.heatmap_pts_proj(lidar_feat) lidar_feat_flatten = lidar_feat.reshape(batch_size, self.hidden_channel, H*W) # [bs, C, H*W] lidar_feat_output = torch.zeros(batch_size, self.hidden_channel, H*W).to(lidar_feat.device) lidar_feat_count = torch.zeros(batch_size, 1, H*W).to(lidar_feat.device) bev_pos = self.bev_pos.repeat(batch_size, 1, 1).to(lidar_feat.device) query_pos_realmetric = bev_pos.permute(0, 2, 1) * self.test_cfg['out_size_factor'] * \ self.test_cfg['voxel_size'][0] + self.test_cfg['pc_range'][0] # (bs, 2, H*W) query_pos_3d = torch.cat([query_pos_realmetric, voxel_height[:, None]], dim=1) # (bs, 3, H*W) points_4d = torch.cat([query_pos_3d, torch.ones_like(query_pos_3d[:, :1])], dim=1).permute(0, 2, 1) # (bs, H*W, 4) points_2d = torch.matmul(points_4d[:, None], lidar2img_rt.transpose(-1, -2)) # (bs, num_view, H*W, 4) points_2d[..., 2] = torch.clamp(points_2d[..., 2], min=1e-5) points_2d[..., :2] = points_2d[..., :2] / points_2d[..., 2:3] / self.out_size_factor_img if 'valid_shape' in img_metas[0]: valid_shape = [] for sample_idx in range(batch_size): sample_valid_shape = 
img_metas[sample_idx]['valid_shape'] / self.out_size_factor_img valid_shape.append(sample_valid_shape) valid_shape = np.array(valid_shape) valid_img_w = valid_shape[..., 0] valid_img_h = valid_shape[..., 1] else: valid_img_w = np.full([batch_size, self.num_views], img_feat[0].shape[-1]) valid_img_h = np.full([batch_size, self.num_views], img_feat[0].shape[-2]) valid_img_w = torch.from_numpy(valid_img_w).to(points_2d.device) valid_img_h = torch.from_numpy(valid_img_h).to(points_2d.device) img_h, img_w = img_feat[0].shape[-2], img_feat[0].shape[-1] center_xs = points_2d[..., 0] # (bs, num_view, H*W) center_ys = points_2d[..., 1] on_the_image = (center_xs >= 0) & (center_xs < valid_img_w[..., None]) & (center_ys >= 0) & \ (center_ys < valid_img_h[..., None]) & valid_height_mask[:, None] # [bs, num_view, H*W] depth = points_2d[..., 2] # [bs, num_view, H*W] depth = torch.log(depth) for sample_idx in range(batch_size): on_the_image_sample = on_the_image[sample_idx] # [num_view, H*W] bincount = torch.sum(on_the_image_sample, dim=1) max_len = torch.max(bincount) sample_query_feature = torch.zeros([self.num_views, self.hidden_channel, max_len], device=points_2d.device) sample_query_pos = torch.zeros([self.num_views, max_len, 3], device=points_2d.device) sample_reference_points = torch.zeros([self.num_views, max_len, 2], device=points_2d.device) sample_padding_mask = torch.zeros([self.num_views, max_len], device=points_2d.device, dtype=torch.bool) for view_idx in range(self.num_views): on_the_image_view = on_the_image_sample[view_idx] center_xs_view = center_xs[sample_idx, view_idx, on_the_image_view] # [N, ] center_ys_view = center_ys[sample_idx, view_idx, on_the_image_view] # [N, ] reference_points = torch.stack([center_xs_view / img_w, center_ys_view / img_h], dim=-1) # [N, 2] view_count = bincount[view_idx] sample_reference_points[view_idx, :view_count] = reference_points sample_query_feature[view_idx, :, :view_count] = lidar_feat_flatten[sample_idx, :, 
on_the_image_view] sample_query_pos[view_idx, :view_count, 2] = depth[sample_idx, view_idx, on_the_image_view] sample_padding_mask[view_idx, view_count:] = True sample_centers_normal = sample_reference_points * 2 - 1 sample_query_img_feat = [] for lvl in range(self.level_num): img_feat_lvl = img_feat[lvl][sample_idx] img_feat_lvl = F.grid_sample(img_feat_lvl, sample_centers_normal[:, None], mode='bilinear', padding_mode="border", align_corners=False) img_feat_lvl = img_feat_lvl[:, :, 0] sample_query_img_feat.append(img_feat_lvl) sample_query_img_feat = torch.stack(sample_query_img_feat, dim=0) sample_query_img_feat = torch.max(sample_query_img_feat, dim=0)[0] # [num_view, C, max_len] sample_query_feature = sample_query_feature + sample_query_img_feat sample_query_pos[..., :2] = inverse_sigmoid(sample_reference_points) sample_reference_points = sample_reference_points[:, :, None].repeat(1, 1, self.level_num, 1) if batch_size == 1: # whether it is doing evaluation or training if input_padding_mask is None: sample_input_padding_mask = None else: sample_input_padding_mask = input_padding_mask[sample_idx:sample_idx+1] output = self.cross_heatmap_decoder( sample_query_feature, img_feats_stack[sample_idx], sample_query_pos, normal_img_feats_pos_stack.repeat(self.num_views, 1, 1), reference_points=sample_reference_points, level_start_index=level_start_index, spatial_shapes=spatial_shapes, query_padding_mask=sample_padding_mask, input_padding_mask=sample_input_padding_mask ) else: output = [] for view_idx in range(self.num_views): view_query_feature = sample_query_feature[view_idx, :, torch.logical_not(sample_padding_mask[view_idx])] view_query_pos = sample_query_pos[view_idx, torch.logical_not(sample_padding_mask[view_idx])] view_reference_points = sample_reference_points[view_idx, torch.logical_not(sample_padding_mask[view_idx])] if input_padding_mask is None: view_input_padding_mask = None else: view_input_padding_mask = input_padding_mask[sample_idx, view_idx, None] 
output_item = self.cross_heatmap_decoder( view_query_feature[None], img_feats_stack[sample_idx, view_idx, None], view_query_pos[None], normal_img_feats_pos_stack, reference_points=view_reference_points[None], level_start_index=level_start_index, spatial_shapes=spatial_shapes, input_padding_mask=view_input_padding_mask ) output_item_pad = torch.zeros([output_item.shape[1], sample_padding_mask.shape[1]]).type_as(output_item) output_item_pad[:, torch.logical_not(sample_padding_mask[view_idx])] = output_item[0] output.append(output_item_pad) output = torch.stack(output, dim=0) for view_idx in range(self.num_views): view_count = bincount[view_idx] on_the_image_view = on_the_image_sample[view_idx] overlap_mask = lidar_feat_count[sample_idx, 0, on_the_image_view] > 0 output_view = output[view_idx, :, :view_count] nonoverlap_mask = torch.logical_not(overlap_mask) lidar_feat_output_view = lidar_feat_output[sample_idx, :, on_the_image_view] lidar_feat_output_view[:, overlap_mask] = torch.maximum(lidar_feat_output_view[:, overlap_mask], output_view[:, overlap_mask]) lidar_feat_output_view[:, nonoverlap_mask] = output_view[:, nonoverlap_mask] lidar_feat_output[sample_idx, :, on_the_image_view] = lidar_feat_output_view lidar_feat_count[sample_idx, :, on_the_image_view] += 1 lidar_feat_output = lidar_feat_output.reshape(batch_size, lidar_feat_output.shape[1], H, W) # lidar_feat_output = self.reduce_conv(lidar_feat_output) lidar_feat_count = lidar_feat_count.reshape(batch_size, 1, H, W) lidar_feat_flag = torch.where(lidar_feat_count>0, torch.ones_like(lidar_feat_count), torch.zeros_like(lidar_feat_count)) lidar_feat_output = lidar_feat_output + (1 - lidar_feat_flag) * lidar_feat lidar_feat_output = torch.cat([lidar_feat_output, lidar_feat_flag], dim=1) lidar_feat_output = self.reduce_conv(lidar_feat_output) heatmap_output = self.cross_heatmap_head(lidar_feat_output.contiguous()) return heatmap_output def generate_heatmap(self, lidar_feat, min_voxel_height, max_voxel_height, 
# Remainder of the generate_heatmap() signature; its `def` line is on the preceding source line.
                         batch_size, img_metas, lidar2img_rt, img_feat=None, input_padding_mask=None):
        """Predict the BEV heatmap and pick the top-scoring proposal locations.

        ``self.heatmap_head`` produces a dense heatmap from ``lidar_feat``.
        When ``img_feat`` is given, ``generate_heatmap_deform`` is evaluated at
        the mean of ``min_voxel_height`` and ``max_voxel_height``; the score
        map is then either that result alone (``self.cross_only``) or the
        average of both sigmoid maps. Scores that are not a local maximum are
        zeroed and the ``self.num_proposals`` highest entries over all classes
        are selected.

        Returns:
            tuple: ``(heatmap, dense_heatmap, top_proposals_class,
            top_proposals_index)`` where ``heatmap`` is the detached,
            NMS-filtered score map viewed as [BS, num_class, H*W],
            ``dense_heatmap`` is the raw head output, and the last two are
            class and flattened-position indices of the selected proposals.
        """
        dense_heatmap = self.heatmap_head(lidar_feat)  # [BS, num_class, H, W]
        if img_feat is None:
            heatmap = dense_heatmap.detach().sigmoid()  # [BS, num_class, H, W]
        else:
            voxel_height = (min_voxel_height + max_voxel_height) / 2
            dense_heatmap_cross = self.generate_heatmap_deform(lidar_feat, img_feat, voxel_height, img_metas, lidar2img_rt, input_padding_mask)
            if self.cross_only:
                heatmap = dense_heatmap_cross.detach().sigmoid()
            else:
                heatmap = (dense_heatmap.detach().sigmoid() + dense_heatmap_cross.detach().sigmoid()) / 2
            # NOTE(review): the nesting level of the next statement is
            # reconstructed from whitespace-collapsed text; it is placed at the
            # level of the outer `else` - confirm against the original file.
            dense_heatmap = dense_heatmap_cross
        padding = self.nms_kernel_size // 2
        local_max = torch.zeros_like(heatmap)
        # equals to nms radius = voxel_size * out_size_factor * kernel_size
        # NOTE(review): the slice below looks like it assumes an odd kernel
        # size of at least 3; with padding == 0 the target slice `0:-0` is empty.
        local_max_inner = F.max_pool2d(heatmap, kernel_size=self.nms_kernel_size, stride=1, padding=0)
        local_max[:, :, padding:(-padding), padding:(-padding)] = local_max_inner
        ## for Pedestrian & Traffic_cone in nuScenes
        # kernel_size=1 pooling returns the input, so these classes skip the NMS.
        if self.test_cfg['dataset'] == 'nuScenes':
            local_max[:, 8, ] = F.max_pool2d(heatmap[:, 8], kernel_size=1, stride=1, padding=0)
            local_max[:, 9, ] = F.max_pool2d(heatmap[:, 9], kernel_size=1, stride=1, padding=0)
        elif self.test_cfg['dataset'] == 'Waymo':  # for Pedestrian & Cyclist in Waymo
            local_max[:, 1, ] = F.max_pool2d(heatmap[:, 1], kernel_size=1, stride=1, padding=0)
            local_max[:, 2, ] = F.max_pool2d(heatmap[:, 2], kernel_size=1, stride=1, padding=0)
        # Keep a score only where it equals the local maximum; others become 0.
        heatmap = heatmap * (heatmap == local_max)  # [BS, num_class, H, W]
        heatmap = heatmap.view(batch_size, heatmap.shape[1], -1)  # [BS, num_class, H*W]

        # top #num_proposals among all classes
        top_proposals = heatmap.reshape(batch_size, -1).argsort(dim=-1, descending=True)[..., :self.num_proposals]  # [BS, num_proposals]
        # Split the flat index over [num_class, H*W] into class and position.
        top_proposals_class = top_proposals // heatmap.shape[-1]  # [BS, num_proposals]
        top_proposals_index = top_proposals % heatmap.shape[-1]  # [BS, num_proposals]
        return heatmap, dense_heatmap, top_proposals_class, top_proposals_index

    def generate_heatmap_img(self, img_feats, batch_size):
        """Predict per-view image heatmaps and pick the top-scoring proposals.

        Every feature level is passed through its own head
        ``self.img_heatmap_head[lvl]``. The detached sigmoid scores are
        filtered with the same local-maximum rule as in ``generate_heatmap``
        (kernel ``self.img_nms_kernel_size``), reshaped per view and
        concatenated over levels. The ``self.num_img_proposals`` highest
        entries over classes, views and positions are selected.

        Returns:
            tuple: ``(img_heatmap_stack, img_dense_heatmaps_stack,
            top_proposals_class, top_proposals_index,
            top_proposals_view_index, top_proposals_pos_index)``.
            ``img_heatmap_stack`` is viewed as [BS, num_class, num_views*sum(h*w)];
            ``top_proposals_index`` is the index inside the joint
            (view, position) axis, ``top_proposals_view_index`` and
            ``top_proposals_pos_index`` are its two components.
        """
        img_dense_heatmaps = []
        img_heatmaps = []
        for lvl in range(self.level_num):
            # img_dense_heatmap = self.img_heatmap_head(img_feats[lvl])  # [BS*num_view, num_class, h, w]
            img_dense_heatmap = self.img_heatmap_head[lvl](img_feats[lvl])  # [BS*num_view, num_class, h, w]
            img_heatmap = img_dense_heatmap.detach().sigmoid()  # [BS*num_view, num_class, h, w]
            padding = self.img_nms_kernel_size // 2
            local_max = torch.zeros_like(img_heatmap)
            # equals to nms radius = voxel_size * out_size_factor * kernel_size
            # NOTE(review): same slicing pattern as in generate_heatmap; looks
            # like it assumes an odd kernel size of at least 3.
            local_max_inner = F.max_pool2d(img_heatmap, kernel_size=self.img_nms_kernel_size, stride=1, padding=0)
            local_max[:, :, padding:(-padding), padding:(-padding)] = local_max_inner
            img_heatmap = img_heatmap * (img_heatmap == local_max)  # [BS*num_view, num_class, h, w]
            img_heatmap = img_heatmap.view(batch_size, self.num_views, img_heatmap.shape[1], -1)  # [BS, num_views, num_class, h*w]
            img_heatmap = img_heatmap.permute(0, 2, 1, 3)  # [BS, num_class, num_views, h*w]
            img_heatmaps.append(img_heatmap)

            # The raw (pre-sigmoid) map gets the same layout.
            img_dense_heatmap = img_dense_heatmap.view(batch_size, self.num_views, img_dense_heatmap.shape[1], img_dense_heatmap.shape[2], img_dense_heatmap.shape[3])  # [BS, num_views, num_class, h, w]
            img_dense_heatmap = img_dense_heatmap.permute(0, 2, 1, 3, 4)  # [BS, num_class, num_views, h, w]
            img_dense_heatmap = img_dense_heatmap.view(batch_size, self.num_classes, self.num_views, img_dense_heatmap.shape[-2]*img_dense_heatmap.shape[-1])
            img_dense_heatmaps.append(img_dense_heatmap)

        img_heatmap_stack = torch.cat(img_heatmaps, dim=3)  # [BS, num_class, num_views, h*w (sum)]
        # top #num_proposals among all classes
        top_proposals = img_heatmap_stack.view(batch_size, -1).argsort(dim=-1, descending=True)[..., :self.num_img_proposals]  # [BS, num_proposals]
        # Decompose the flat index over [num_class, num_views, h*w (sum)].
        top_proposals_class = top_proposals // (img_heatmap_stack.shape[-1]*img_heatmap_stack.shape[-2])  # [BS, num_proposals]
        top_proposals_view_index = top_proposals % (img_heatmap_stack.shape[-1]*img_heatmap_stack.shape[-2]) // img_heatmap_stack.shape[-1]  # [BS, num_proposals]
        top_proposals_pos_index = top_proposals % img_heatmap_stack.shape[-1]  # [BS, num_proposals]
        top_proposals_index = top_proposals % (img_heatmap_stack.shape[-1]*img_heatmap_stack.shape[-2])  # [BS, num_proposals]

        # Merge the view and position axes.
        img_heatmap_stack = img_heatmap_stack.contiguous().view(batch_size, img_heatmap_stack.shape[1], -1)
        img_dense_heatmaps_stack = torch.cat(img_dense_heatmaps, dim=-1)

        return img_heatmap_stack, img_dense_heatmaps_stack, top_proposals_class, top_proposals_index, top_proposals_view_index, top_proposals_pos_index

    def get_targets(self, gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels, gt_img_centers_view, gt_bboxes_cam_view, gt_visible, gt_bboxes_lidar_view, preds_dict, img_metas):
        """Generate training targets for the whole batch.

        ``preds_dict[0]`` is split along its first dimension into one dict per
        sample. Per-sample targets are then computed with
        ``get_targets_single`` (3D branch), ``get_targets_single_2d`` (image
        branch) and, when ``self.view_transform`` is set,
        ``get_targets_single_view``; the per-sample results are concatenated
        along dimension 0.

        Args:
            gt_bboxes_3d: Per-sample ground-truth 3D boxes.
            gt_labels_3d: Per-sample labels of the 3D boxes.
            gt_bboxes: Per-sample 2D boxes for the image branch.
            gt_labels: Per-sample labels for the image branch.
            gt_img_centers_view: Per-sample centers passed to the image branch.
            gt_bboxes_cam_view: Per-sample boxes passed to the image branch.
            gt_visible: Per-sample visibility flags of the 3D boxes.
            gt_bboxes_lidar_view: Per-sample boxes passed to the image branch.
            preds_dict: Indexable whose first element maps names to tensors
                with the batch as first dimension.
            img_metas: Per-sample meta dicts.

        Returns:
            tuple: For the 3D branch ``labels, label_weights, bbox_targets,
            bbox_weights, ious, num_pos_layer, matched_ious`` (plus
            ``heatmap`` when ``self.initialize_by_heatmap``), followed by the
            same group with suffix ``_2d`` for the image branch (plus
            ``heatmap_2d``), followed, when ``self.view_transform``, by the
            seven ``_view`` items.
        """
        # change preds_dict into list of dict (index by batch_id)
        # preds_dict[0]['center'].shape [bs, 3, num_proposal]
        list_of_pred_dict = []
        for batch_idx in range(len(gt_bboxes_3d)):
            pred_dict = {}
            for key in preds_dict[0].keys():
                # Slicing (not indexing) keeps a leading batch dimension of size 1.
                pred_dict[key] = preds_dict[0][key][batch_idx:batch_idx + 1]
            list_of_pred_dict.append(pred_dict)

        assert len(gt_bboxes_3d) == len(list_of_pred_dict)

        # 3D branch. The last argument supplies the sample index.
        res_tuple = multi_apply(self.get_targets_single, gt_bboxes_3d, gt_labels_3d, gt_visible, list_of_pred_dict, np.arange(len(gt_labels_3d)))
        labels = torch.cat(res_tuple[0], dim=0)
        label_weights = torch.cat(res_tuple[1], dim=0)
        bbox_targets = torch.cat(res_tuple[2], dim=0)
        bbox_weights = torch.cat(res_tuple[3], dim=0)
        ious = torch.cat(res_tuple[4], dim=0)
        num_pos_layer = np.concatenate(res_tuple[5], axis=0)  # [BS, num_layer]
        # matched_ious = np.mean(res_tuple[6])
        matched_ious = torch.cat(res_tuple[6], dim=0)

        # Image (2D) branch.
        res_tuple_2d = multi_apply(self.get_targets_single_2d, gt_bboxes, gt_labels, gt_img_centers_view, gt_bboxes_cam_view, gt_bboxes_lidar_view, list_of_pred_dict, img_metas, np.arange(len(gt_bboxes)))
        labels_2d = torch.cat(res_tuple_2d[0], dim=0)
        label_weights_2d = torch.cat(res_tuple_2d[1], dim=0)
        bbox_targets_2d = torch.cat(res_tuple_2d[2], dim=0)
        bbox_weights_2d = torch.cat(res_tuple_2d[3], dim=0)
        ious_2d = torch.cat(res_tuple_2d[4], dim=0)
        num_pos_layer_2d = np.concatenate(res_tuple_2d[5], axis=0)  # [BS, num_layer]
        matched_ious_2d = torch.cat(res_tuple_2d[6], dim=0)

        # View-transform branch, only when enabled.
        if self.view_transform:
            res_tuple_view = multi_apply(self.get_targets_single_view, gt_bboxes_3d, gt_labels_3d, gt_visible, list_of_pred_dict, np.arange(len(gt_bboxes)))
            labels_view = torch.cat(res_tuple_view[0], dim=0)
            label_weights_view = torch.cat(res_tuple_view[1], dim=0)
            bbox_targets_view = torch.cat(res_tuple_view[2], dim=0)
            bbox_weights_view = torch.cat(res_tuple_view[3], dim=0)
            ious_view = torch.cat(res_tuple_view[4], dim=0)
            num_pos_layer_view = np.concatenate(res_tuple_view[5], axis=0)  # [BS, num_layer]
            matched_ious_view = torch.cat(res_tuple_view[6], dim=0)

        if self.initialize_by_heatmap:
            # Dense heatmap targets are the eighth item of the 3D and 2D results.
            heatmap = torch.cat(res_tuple[7], dim=0)
            heatmap_2d = torch.cat(res_tuple_2d[7], dim=0)
            if self.view_transform:
                return labels, label_weights, bbox_targets, bbox_weights, ious, num_pos_layer, matched_ious, heatmap, \
                       labels_2d, label_weights_2d, bbox_targets_2d, bbox_weights_2d, ious_2d, num_pos_layer_2d, \
                       matched_ious_2d, heatmap_2d, labels_view, label_weights_view, bbox_targets_view, bbox_weights_view, \
                       ious_view, num_pos_layer_view, matched_ious_view
            else:
                return labels, label_weights, bbox_targets, bbox_weights, ious, num_pos_layer, matched_ious, heatmap, \
                       labels_2d, label_weights_2d, bbox_targets_2d, bbox_weights_2d, ious_2d, num_pos_layer_2d, \
                       matched_ious_2d, heatmap_2d
        else:
            if self.view_transform:
                return labels, label_weights, bbox_targets, bbox_weights, ious, num_pos_layer, matched_ious, \
                       labels_2d, label_weights_2d, bbox_targets_2d, bbox_weights_2d, ious_2d, num_pos_layer_2d, matched_ious_2d, \
                       labels_view, label_weights_view, bbox_targets_view, bbox_weights_view, ious_view, num_pos_layer_view, \
                       matched_ious_view
            else:
                # The trailing comma is part of the original expression; the
                # value is still a plain tuple.
                return labels, label_weights, bbox_targets, bbox_weights, ious, num_pos_layer, matched_ious, \
                       labels_2d, label_weights_2d, bbox_targets_2d, bbox_weights_2d, ious_2d, num_pos_layer_2d, matched_ious_2d,

    def get_targets_single_2d(self, gt_bboxes, gt_labels, gt_centers_2d, gt_bboxes_cam_view, gt_bboxes_lidar_view, preds_dict, img_metas, batch_idx):
        # Only the opening statements of this method are on these source lines;
        # the remainder follows on the next source lines.
        num_proposals = preds_dict['cls'].shape[-1]
        # Work on detached deep copies of the predictions.
        loc_cam_3d = copy.deepcopy(preds_dict['loc_cam_3d'].detach())
        dim = copy.deepcopy(preds_dict['dim_2d'].detach())
        rot = copy.deepcopy(preds_dict['rot_2d'].detach())
        if 'vel_2d' in preds_dict.keys():
            vel = copy.deepcopy(preds_dict['vel_2d'].detach())
        else:
            vel = None
        view = copy.deepcopy(preds_dict['view'].detach())[0]  # [num_proposals, ]
        score = copy.deepcopy(preds_dict['cls'].detach())

        bboxes_dict = self.bbox_2d_coder.decode(score, rot, dim, loc_cam_3d,  # call continues on the following source line
vel) bboxes_3d_tensor = bboxes_dict[0]['bboxes'] gt_bboxes_3d_tensor = gt_bboxes_cam_view.tensor.to(score.device) gt_bboxes_lidar_view_tensor = gt_bboxes_lidar_view.tensor.to(score.device) assert gt_bboxes_lidar_view_tensor.shape[0] == gt_bboxes_3d_tensor.shape[0] img_shape = img_metas['pad_shape'] img_scale =[img_shape[1], img_shape[0], img_shape[1], img_shape[0]] img_scale = torch.Tensor(img_scale).to(score.device).unsqueeze(0) gt_centers_2d = gt_centers_2d.float() normal_gt_centers = gt_centers_2d[..., :2] / img_scale[..., :2] normal_gt_bboxes = gt_bboxes.float() / img_scale assign_result_list = [] for idx_layer in range(self.num_img_decoder_layers): bboxes_tensor_layer = bboxes_3d_tensor[idx_layer*self.num_img_proposals:(idx_layer+1)*self.num_img_proposals, :] # [num_proposals, 10] score_layer = score[..., idx_layer*self.num_img_proposals:(idx_layer+1)*self.num_img_proposals] # [1, num_class, num_proposal] view_layer = view[idx_layer*self.num_img_proposals:(idx_layer+1)*self.num_img_proposals] # [num_proposals] assign_result = self.bbox_assigner_2d.assign(bboxes_tensor_layer, gt_bboxes_3d_tensor, gt_labels, score_layer, view_layer, self.train_cfg) assign_result_list.append(assign_result) # combine assign result of each layer assign_result_ensemble = AssignResult( num_gts=sum([res.num_gts for res in assign_result_list]), gt_inds=torch.cat([res.gt_inds for res in assign_result_list]), max_overlaps=torch.cat([res.max_overlaps for res in assign_result_list]), labels=torch.cat([res.labels for res in assign_result_list]), ) sampling_result = self.bbox_sampler.sample(assign_result_ensemble, bboxes_3d_tensor, gt_bboxes_3d_tensor) pos_inds = sampling_result.pos_inds neg_inds = sampling_result.neg_inds assert len(pos_inds) + len(neg_inds) == num_proposals start = 0 pos_num_layers = [] for idx_layer in range(self.num_img_decoder_layers): layer_num_proposal = self.num_img_proposals layer_mask = torch.logical_and(pos_inds>=start, pos_inds 0: # bbox_targets[pos_inds, :] = 
sampling_result.pos_gt_bboxes bbox_weights[pos_inds, :] = 1.0 pos_gt_bboxes = sampling_result.pos_gt_bboxes pos_bbox_targets = self.bbox_2d_coder.encode(pos_gt_bboxes) bbox_targets[pos_inds, :pos_bbox_targets.shape[1]] = pos_bbox_targets view_targets[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds, 1] if gt_labels is None: labels[pos_inds] = 1 else: labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds, 0] if self.train_cfg.pos_weight <= 0: label_weights[pos_inds] = 1.0 else: label_weights[pos_inds] = self.train_cfg.pos_weight center_targets[pos_inds, :] = normal_gt_centers[sampling_result.pos_assigned_gt_inds, :2] center_weights[pos_inds] = 1.0 depth = gt_centers_2d[sampling_result.pos_assigned_gt_inds, 2] depth_labels[pos_inds] = depth depth_weights[pos_inds] = 1 view_mask_ignore = view_targets != view bbox_weights[view_mask_ignore, :] = 0 label_weights[view_mask_ignore] = 0 if len(neg_inds) > 0: label_weights[neg_inds] = 1.0 bbox_targets[:, :2] = center_targets bbox_targets[:, 2] = depth_labels # # compute dense heatmap targets if self.initialize_by_heatmap: device = labels.device feature_map_size = (img_shape[1] // self.out_size_factor_img, img_shape[0] // self.out_size_factor_img) w, h = feature_map_size heatmaps = [] for lvl in range(self.level_num): heatmaps.append(score.new_zeros(self.num_classes, self.num_views, h, w)) h = h // 2 w = w // 2 for idx in range(len(gt_bboxes)): width = gt_bboxes[idx][2] length = gt_bboxes[idx][3] max_l = max(length, width) width = width / self.out_size_factor_img length = length / self.out_size_factor_img view_id = gt_labels[idx][1] if width > 0 and length > 0: radius = gaussian_radius((length, width), min_overlap=self.train_cfg['gaussian_overlap_2d']) radius = max(self.train_cfg['min_radius'], radius) radius = min(self.train_cfg['max_radius'], radius) x, y = gt_centers_2d[idx][0], gt_centers_2d[idx][1] # x, y = gt_bboxes[idx][0], gt_bboxes[idx][1] coor_x = x / self.out_size_factor_img coor_y = y / 
self.out_size_factor_img center = torch.tensor([coor_x, coor_y], dtype=torch.float32, device=device) if self.level_num == 4: if max_l < 48: lvl = 0 elif max_l < 96: lvl = 1 center = center / 2 radius = radius / 2 elif max_l < 192: lvl = 2 center = center / 4 radius = radius / 4 else: lvl = 3 center = center / 8 radius = radius / 8 elif self.level_num == 3: if max_l < 48: lvl = 0 elif max_l < 96: lvl = 1 center = center / 2 radius = radius / 2 else: lvl = 2 center = center / 4 radius = radius / 4 elif self.level_num == 2: if max_l < 96: lvl = 0 else: lvl = 1 center = center / 2 radius = radius / 2 else: assert self.level_num == 1 lvl = 0 center_int = center.to(torch.int32) radius = int(radius) draw_heatmap_gaussian(heatmaps[lvl][gt_labels[idx][0], view_id], center_int, radius) for lvl in range(self.level_num): heatmaps[lvl] = heatmaps[lvl].view(self.num_classes, self.num_views, heatmaps[lvl].shape[-2]*heatmaps[lvl].shape[-1]) heatmap = torch.cat(heatmaps, dim=-1) matched_ious = torch.ones_like(ious) * -1 matched_ious[pos_inds] = ious[pos_inds] return labels[None], label_weights[None], bbox_targets[None], bbox_weights[None], ious[None], pos_num_layers[None], matched_ious[None], heatmap[None], labels_lidar[None], label_lidar_weights[None], bbox_lidar_targets[None], bbox_lidar_weights[None], ious_lidar[None] else: matched_ious = torch.ones_like(ious) * -1 matched_ious[pos_inds] = ious[pos_inds] return labels[None], label_weights[None], bbox_targets[None], bbox_weights[None], ious[None], pos_num_layers[None], matched_ious[None], labels_lidar[None], label_lidar_weights[None], bbox_lidar_targets[None], bbox_lidar_weights[None], ious_lidar[None] def get_targets_single(self, gt_bboxes_3d, gt_labels_3d, gt_visible, preds_dict, batch_idx): """Generate training targets for a single sample. Args: gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes. gt_labels_3d (torch.Tensor): Labels of boxes. gt_bboxes (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes 2d. 
gt_labels (torch.Tensor): Labels of boxes 2d. preds_dict (dict): dict of prediction result for a single sample Returns: tuple[torch.Tensor]: Tuple of target including \ the following results in order. - torch.Tensor: classification target. [1, num_proposals] - torch.Tensor: classification weights (mask) [1, num_proposals] - torch.Tensor: regression target. [1, num_proposals, 8] - torch.Tensor: regression weights. [1, num_proposals, 8] - torch.Tensor: iou target. [1, num_proposals] - int: number of positive proposals """ num_proposals = preds_dict['center'].shape[-1] # get pred boxes, carefully ! donot change the network outputs score = copy.deepcopy(preds_dict['heatmap'].detach()) center = copy.deepcopy(preds_dict['center'].detach()) height = copy.deepcopy(preds_dict['height'].detach()) dim = copy.deepcopy(preds_dict['dim'].detach()) rot = copy.deepcopy(preds_dict['rot'].detach()) if 'vel' in preds_dict.keys(): vel = copy.deepcopy(preds_dict['vel'].detach()) else: vel = None boxes_dict = self.bbox_coder.decode(score, rot, dim, center, height, vel) # decode the prediction to real world metric bbox bboxes_tensor = boxes_dict[0]['bboxes'] gt_bboxes_tensor = gt_bboxes_3d.tensor.to(score.device) num_fusion_decoder_layers = self.num_fusion_decoder_layers num_layer = self.num_pts_decoder_layers + num_fusion_decoder_layers start = 0 pos_inds = [] neg_inds = [] pos_gt_bboxes = [] pos_gt_labels = [] ious = [] for idx_layer in range(num_layer): layer_num_proposal = self.get_layer_num_proposal(idx_layer) bboxes_tensor_layer = bboxes_tensor[start:start + layer_num_proposal, :] score_layer = score[..., start:start + layer_num_proposal] gt_bboxes_tensor_layer = gt_bboxes_tensor gt_labels_3d_layer = gt_labels_3d if self.train_cfg.assigner.type == 'HungarianAssigner3D': assign_result = self.bbox_assigner.assign(bboxes_tensor_layer, gt_bboxes_tensor_layer, gt_labels_3d_layer, score_layer, self.train_cfg) elif self.train_cfg.assigner.type == 'HeuristicAssigner': assign_result = 
self.bbox_assigner.assign(bboxes_tensor_layer, gt_bboxes_tensor_layer, None, gt_labels_3d_layer, self.query_labels[batch_idx]) else: raise NotImplementedError # assign_result_list.append(assign_result) sampling_result_layer = self.bbox_sampler.sample(assign_result, bboxes_tensor_layer, gt_bboxes_tensor_layer) pos_inds_layer = sampling_result_layer.pos_inds + start neg_inds_layer = sampling_result_layer.neg_inds + start pos_inds.append(pos_inds_layer) neg_inds.append(neg_inds_layer) pos_gt_bboxes_layer = sampling_result_layer.pos_gt_bboxes pos_gt_labels_layer = gt_labels_3d_layer[sampling_result_layer.pos_assigned_gt_inds] pos_gt_bboxes.append(pos_gt_bboxes_layer) pos_gt_labels.append(pos_gt_labels_layer) ious_layer = assign_result.max_overlaps ious.append(ious_layer) start += layer_num_proposal pos_inds = torch.cat(pos_inds) neg_inds = torch.cat(neg_inds) pos_gt_bboxes = torch.cat(pos_gt_bboxes, dim=0) pos_gt_labels = torch.cat(pos_gt_labels, dim=0) assert len(pos_inds) + len(neg_inds) == num_proposals start = 0 pos_num_layers = [] for idx_layer in range(num_layer): layer_num_proposal = self.get_layer_num_proposal(idx_layer) count = pos_inds[torch.logical_and(pos_inds>=start, pos_inds 0: label_weights[neg_inds] = 1.0 if len(pos_inds) > 0: pos_bbox_targets = self.bbox_coder.encode(pos_gt_bboxes) bbox_targets[pos_inds, :] = pos_bbox_targets bbox_weights[pos_inds, :] = 1.0 if gt_labels_3d is None: labels[pos_inds] = 1 else: labels[pos_inds] = pos_gt_labels if self.train_cfg.pos_weight <= 0: label_weights[pos_inds] = 1.0 else: label_weights[pos_inds] = self.train_cfg.pos_weight # # compute dense heatmap targets if self.initialize_by_heatmap: device = labels.device gt_bboxes_3d = torch.cat([gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]], dim=1).to(device) grid_size = torch.tensor(self.train_cfg['grid_size']) pc_range = torch.tensor(self.train_cfg['point_cloud_range']) voxel_size = torch.tensor(self.train_cfg['voxel_size']) feature_map_size = grid_size[:2] // 
self.train_cfg['out_size_factor'] # [x_len, y_len] heatmap = gt_bboxes_3d.new_zeros(self.num_classes, feature_map_size[1], feature_map_size[0]) for idx in range(len(gt_bboxes_3d)): width = gt_bboxes_3d[idx][3] length = gt_bboxes_3d[idx][4] width = width / voxel_size[0] / self.train_cfg['out_size_factor'] length = length / voxel_size[1] / self.train_cfg['out_size_factor'] if width > 0 and length > 0: radius = gaussian_radius((length, width), min_overlap=self.train_cfg['gaussian_overlap']) radius = max(self.train_cfg['min_radius'], int(radius)) x, y = gt_bboxes_3d[idx][0], gt_bboxes_3d[idx][1] coor_x = (x - pc_range[0]) / voxel_size[0] / self.train_cfg['out_size_factor'] coor_y = (y - pc_range[1]) / voxel_size[1] / self.train_cfg['out_size_factor'] center_img = torch.tensor([coor_x, coor_y], dtype=torch.float32, device=device) center_int = center_img.to(torch.int32) draw_heatmap_gaussian(heatmap[gt_labels_3d[idx]], center_int, radius) matched_ious = torch.ones_like(ious) * -1 matched_ious[pos_inds] = ious[pos_inds] return labels[None], label_weights[None], bbox_targets[None], bbox_weights[None], ious[None], pos_num_layers[None], matched_ious[None], heatmap[None] else: matched_ious = torch.ones_like(ious) * -1 matched_ious[pos_inds] = ious[pos_inds] return labels[None], label_weights[None], bbox_targets[None], bbox_weights[None], ious[None], pos_num_layers[None], matched_ious[None] def get_targets_single_view(self, gt_bboxes_3d, gt_labels_3d, gt_visible_3d, preds_dict, batch_idx): """Generate training targets for a single sample. Args: gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes. gt_labels_3d (torch.Tensor): Labels of boxes. gt_bboxes (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes 2d. gt_labels (torch.Tensor): Labels of boxes 2d. preds_dict (dict): dict of prediction result for a single sample Returns: tuple[torch.Tensor]: Tuple of target including \ the following results in order. - torch.Tensor: classification target. 
[1, num_proposals] - torch.Tensor: classification weights (mask) [1, num_proposals] - torch.Tensor: regression target. [1, num_proposals, 8] - torch.Tensor: regression weights. [1, num_proposals, 8] - torch.Tensor: iou target. [1, num_proposals] - int: number of positive proposals """ num_proposals = preds_dict['center_view'].shape[-1] # get pred boxes, carefully ! donot change the network outputs score = copy.deepcopy(preds_dict['heatmap_view'].detach()) center = copy.deepcopy(preds_dict['center_view'].detach()) height = copy.deepcopy(preds_dict['height_view'].detach()) dim = copy.deepcopy(preds_dict['dim_view'].detach()) rot = copy.deepcopy(preds_dict['rot_view'].detach()) if 'vel_view' in preds_dict.keys(): vel = copy.deepcopy(preds_dict['vel_view'].detach()) else: vel = None boxes_dict = self.bbox_coder.decode(score, rot, dim, center, height, vel) # decode the prediction to real world metric bbox bboxes_tensor = boxes_dict[0]['bboxes'] assert gt_visible_3d.shape[0] == gt_bboxes_3d.tensor.shape[0] == gt_labels_3d.shape[0] gt_mask = gt_visible_3d == 1 gt_bboxes_3d = gt_bboxes_3d[gt_mask] gt_labels_3d = gt_labels_3d[gt_mask] gt_bboxes_tensor = gt_bboxes_3d.tensor.to(score.device) num_layer = 1 assign_result_list = [] start = 0 for idx_layer in range(num_layer): layer_num_proposal = self.get_layer_num_proposal(idx_layer) bboxes_tensor_layer = bboxes_tensor[start:start + layer_num_proposal, :] score_layer = score[..., start:start + layer_num_proposal] start += layer_num_proposal if self.train_cfg.assigner.type == 'HungarianAssigner3D': assign_result = self.bbox_assigner.assign(bboxes_tensor_layer, gt_bboxes_tensor, gt_labels_3d, score_layer, self.train_cfg) elif self.train_cfg.assigner.type == 'HeuristicAssigner': assign_result = self.bbox_assigner.assign(bboxes_tensor_layer, gt_bboxes_tensor, None, gt_labels_3d, self.query_labels[batch_idx]) else: raise NotImplementedError assign_result_list.append(assign_result) # combine assign result of each layer 
@force_fp32(apply_to=('preds_dicts', ))
def loss(self, gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels, gt_pts_centers_view, gt_img_centers_view, gt_bboxes_cam_view, gt_visible_3d, gt_bboxes_lidar_view, img_metas, preds_dicts, **kwargs):
    """Compute the training losses of the fusion head.

    The result contains the dense heatmap losses (only when
    ``self.initialize_by_heatmap`` is set), one classification / box
    regression loss pair per point-cloud and fusion decoder layer, the
    per-layer losses of the image decoder, and the losses of the view
    transform branch (only when ``self.view_transform`` is set).

    Args:
        **The followings are in the same order of "gt_bboxes_3d" :**
        gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground truth
            gt boxes.
        gt_labels_3d (list[torch.Tensor]): Labels of boxes.
        gt_visible_3d (list[torch.Tensor]): Visibility of LiDAR boxes
            for camera.
        **The followings are in the same order of "gt_bboxes":**
        gt_bboxes (list[torch.Tensor]): Ground truth of projected 2d
            boxes. One LiDAR box may be projected to zero/one/two camera
            views, so "gt_bboxes" has a different number of entries than
            "gt_bboxes_3d".
        gt_labels (list[torch.Tensor]): Labels and camera view ids of
            projected 2d boxes.
        gt_pts_centers_view (list[torch.Tensor]): 3D center of each box
            in the LiDAR coordinate. Not read by this method.
        gt_img_centers_view (list[torch.Tensor]): 3D center of each box
            in the corresponding camera coordinate.
        gt_bboxes_cam_view (list[:obj:`CameraInstance3DBoxes`]): Ground
            truth boxes in the corresponding camera coordinate.
        gt_bboxes_lidar_view (list[:obj:`LiDARInstance3DBoxes`]): Ground
            truth boxes in the LiDAR coordinate.
        img_metas (list[dict]): Meta info of each sample. When the first
            entry has a ``'valid_shape'`` key, it is used to mask the 2D
            heatmap loss to the valid image region of every view.
        preds_dicts (list[list[dict]]): Output of forward function.

    Returns:
        dict[str:torch.Tensor]: Loss of heatmap and bbox of each task,
            plus detached monitoring values (``*_matched_ious*``,
            ``*_reg_bbox_2d``).
    """
    # NOTE(review): the two branches taken when ``initialize_by_heatmap``
    # is False call ``get_targets`` without ``gt_bboxes_lidar_view`` while
    # the heatmap branches pass it. The signature of ``get_targets`` is not
    # visible here, so the calls are kept as they were -- confirm which
    # form is correct.
    if self.initialize_by_heatmap:
        if self.view_transform:
            (labels, label_weights, bbox_targets, bbox_weights, ious,
             num_pos_layer, matched_ious, heatmap,
             labels_2d, label_weights_2d, bbox_targets_2d, bbox_weights_2d,
             ious_2d, num_pos_layer_2d, matched_ious_2d, heatmap_2d,
             labels_view, label_weights_view, bbox_targets_view,
             bbox_weights_view, ious_view, num_pos_layer_view,
             matched_ious_view) = self.get_targets(
                 gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels,
                 gt_img_centers_view, gt_bboxes_cam_view, gt_visible_3d,
                 gt_bboxes_lidar_view, preds_dicts[0], img_metas)
        else:
            (labels, label_weights, bbox_targets, bbox_weights, ious,
             num_pos_layer, matched_ious, heatmap,
             labels_2d, label_weights_2d, bbox_targets_2d, bbox_weights_2d,
             ious_2d, num_pos_layer_2d, matched_ious_2d,
             heatmap_2d) = self.get_targets(
                 gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels,
                 gt_img_centers_view, gt_bboxes_cam_view, gt_visible_3d,
                 gt_bboxes_lidar_view, preds_dicts[0], img_metas)
    else:
        if self.view_transform:
            (labels, label_weights, bbox_targets, bbox_weights, ious,
             num_pos_layer, matched_ious,
             labels_2d, label_weights_2d, bbox_targets_2d, bbox_weights_2d,
             ious_2d, num_pos_layer_2d, matched_ious_2d,
             labels_view, label_weights_view, bbox_targets_view,
             bbox_weights_view, ious_view, num_pos_layer_view,
             matched_ious_view) = self.get_targets(
                 gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels,
                 gt_img_centers_view, gt_bboxes_cam_view, gt_visible_3d,
                 preds_dicts[0], img_metas)
        else:
            (labels, label_weights, bbox_targets, bbox_weights, ious,
             num_pos_layer, matched_ious,
             labels_2d, label_weights_2d, bbox_targets_2d, bbox_weights_2d,
             ious_2d, num_pos_layer_2d,
             matched_ious_2d) = self.get_targets(
                 gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels,
                 gt_img_centers_view, gt_bboxes_cam_view, gt_visible_3d,
                 preds_dicts[0], img_metas)

    preds_dict = preds_dicts[0][0]
    loss_dict = dict()

    if self.initialize_by_heatmap:
        # dense LiDAR heatmap loss, normalised by the number of peak cells
        loss_heatmap = self.loss_heatmap(
            clip_sigmoid(preds_dict['dense_heatmap']), heatmap,
            avg_factor=max(heatmap.eq(1).float().sum().item(), 1))

        heatmap_2d_avg_factor = max(heatmap_2d.eq(1).float().sum().item(), 1)
        if 'valid_shape' in img_metas[0].keys():
            bs = heatmap_2d.shape[0]
            num_view = heatmap_2d.shape[2]

            # One zero-initialised weight map per feature level; every
            # level halves the spatial size of the previous one.
            heatmaps_2d_weight = []
            img_w, img_h = self.test_cfg['img_scale']
            img_w = img_w // self.out_size_factor_img
            img_h = img_h // self.out_size_factor_img
            for lvl in range(self.level_num):
                heatmap_2d_weight = torch.zeros(
                    bs, self.num_classes, self.num_views, img_h,
                    img_w).to(heatmap_2d.device)
                heatmaps_2d_weight.append(heatmap_2d_weight)
                img_h = img_h // 2
                img_w = img_w // 2

            # Enable only the top-left region given by 'valid_shape'
            # of every (sample, view) pair on each level.
            for sample_idx in range(bs):
                for view_idx in range(num_view):
                    valid_shape = img_metas[sample_idx]['valid_shape'][view_idx] / self.out_size_factor_img
                    red_width = int(valid_shape[0])
                    red_height = int(valid_shape[1])
                    for lvl in range(self.level_num):
                        heatmaps_2d_weight[lvl][sample_idx, :, view_idx, :red_height, :red_width] = 1
                        red_width = red_width // 2
                        red_height = red_height // 2

            # Flatten the spatial dims of each level and concatenate the
            # levels along the last axis.
            for lvl in range(self.level_num):
                lvl_weight = heatmaps_2d_weight[lvl]
                heatmaps_2d_weight[lvl] = lvl_weight.view(
                    lvl_weight.shape[0], self.num_classes, self.num_views,
                    lvl_weight.shape[-2] * lvl_weight.shape[-1])
            heatmap_2d_weight = torch.cat(heatmaps_2d_weight, dim=-1)

            loss_heatmap_2d = self.loss_heatmap_2d(
                clip_sigmoid(preds_dict['img_dense_heatmap']), heatmap_2d,
                weight=heatmap_2d_weight, avg_factor=heatmap_2d_avg_factor)
        else:
            loss_heatmap_2d = self.loss_heatmap_2d(
                clip_sigmoid(preds_dict['img_dense_heatmap']), heatmap_2d,
                avg_factor=heatmap_2d_avg_factor)

        loss_dict['loss_heatmap'] = loss_heatmap
        loss_dict['loss_heatmap_2d'] = loss_heatmap_2d

    # Sum the positive counts of all samples, keeping one entry per layer.
    num_pos_layer = np.sum(num_pos_layer, axis=0)
    num_pos_layer_2d = np.sum(num_pos_layer_2d, axis=0)
    if self.view_transform:
        num_pos_layer_view = np.sum(num_pos_layer_view, axis=0)

    code_weights = self.train_cfg.get('code_weights', None)
    img_code_weights = self.train_cfg.get('img_code_weights', None)

    # ---- point-cloud and fusion decoder layers ----
    start = 0
    num_layer = self.num_pts_decoder_layers + self.num_fusion_decoder_layers
    for idx_layer in range(num_layer):
        layer_num_proposals = self.get_layer_num_proposal(idx_layer)
        end = start + layer_num_proposals
        if idx_layer < self.num_pts_decoder_layers:
            prefix = f'layer_pts_{idx_layer}'
        else:
            prefix = f'layer_fusion_{idx_layer-self.num_pts_decoder_layers}'
        avg_factor = max(num_pos_layer[idx_layer], 1)

        layer_labels = labels[..., start:end].reshape(-1)
        layer_label_weights = label_weights[..., start:end].reshape(-1)
        layer_score = preds_dict['heatmap'][..., start:end]
        layer_cls_score = layer_score.permute(0, 2, 1).reshape(-1, self.num_classes)
        layer_loss_cls = self.loss_cls(
            layer_cls_score, layer_labels, layer_label_weights,
            avg_factor=avg_factor)

        box_parts = [
            preds_dict['center'][..., start:end],
            preds_dict['height'][..., start:end],
            preds_dict['dim'][..., start:end],
            preds_dict['rot'][..., start:end],
        ]
        if 'vel' in preds_dict.keys():
            box_parts.append(preds_dict['vel'][..., start:end])
        preds = torch.cat(box_parts, dim=1).permute(0, 2, 1)  # [BS, num_proposals, code_size]

        layer_bbox_weights = bbox_weights[:, start:end, :]
        layer_reg_weights = layer_bbox_weights * layer_bbox_weights.new_tensor(code_weights)
        layer_bbox_targets = bbox_targets[:, start:end, :]
        layer_loss_bbox = self.loss_bbox(
            preds, layer_bbox_targets, layer_reg_weights,
            avg_factor=avg_factor)

        # Mean IoU over matched proposals (unmatched ones are marked
        # with a negative value). The count is clamped so that a sample
        # without any match yields 0 instead of NaN.
        layer_match_ious = matched_ious[..., start:end]
        matched = layer_match_ious >= 0
        layer_match_ious = torch.sum(layer_match_ious * matched, dim=-1) / torch.sum(matched, dim=-1).clamp(min=1)
        layer_match_ious = torch.mean(layer_match_ious)

        start = end

        loss_dict[f'{prefix}_loss_cls'] = layer_loss_cls
        loss_dict[f'{prefix}_loss_bbox'] = layer_loss_bbox
        loss_dict[f'{prefix}_matched_ious'] = layer_match_ious

    # ---- image decoder layers ----
    start = 0
    for idx_layer in range(self.num_img_decoder_layers):
        prefix = f'layer_img_{idx_layer}'
        layer_num_proposals = self.num_img_proposals
        end = start + layer_num_proposals
        avg_factor_2d = max(num_pos_layer_2d[idx_layer], 1)

        layer_labels_2d = labels_2d[..., start:end].reshape(-1)
        layer_label_weights_2d = label_weights_2d[..., start:end].reshape(-1)
        layer_score_2d = preds_dict['cls'][..., start:end]
        layer_cls_score_2d = layer_score_2d.permute(0, 2, 1).reshape(-1, self.num_classes)
        layer_loss_cls_2d = self.loss_cls(
            layer_cls_score_2d, layer_labels_2d, layer_label_weights_2d,
            avg_factor=avg_factor_2d)

        preds_2d_center = preds_dict['center_2d'][..., start:end]  # [bs, 2, num_proposal]
        preds_2d_depth = preds_dict['depth_2d'][..., start:end]  # [bs, 1, num_proposal]
        preds_2d_dim = preds_dict['dim_2d'][..., start:end]  # [bs, 3, num_proposal]
        preds_2d_rot = preds_dict['rot_2d'][..., start:end]  # [bs, 2, num_proposal]
        preds_2d_vel = preds_dict['vel_2d'][..., start:end]  # [bs, 2, num_proposal]
        preds_2d = torch.cat(
            [preds_2d_center, preds_2d_depth[:, :1], preds_2d_dim,
             preds_2d_rot, preds_2d_vel],
            dim=1).permute(0, 2, 1)  # [bs, num_proposal, 10]

        # Targets / weights are cropped to the predicted code size.
        layer_bbox_targets_2d = bbox_targets_2d[:, start:end, :preds_2d.shape[2]]
        layer_reg_weights_2d = bbox_weights_2d[:, start:end, :preds_2d.shape[2]]
        layer_reg_weights_2d = layer_reg_weights_2d * layer_reg_weights_2d.new_tensor(img_code_weights)

        layer_loss_center_2d = self.loss_center_2d(
            preds_2d[..., :2], layer_bbox_targets_2d[..., :2],
            layer_reg_weights_2d[..., :2], avg_factor=avg_factor_2d)
        layer_loss_depth_2d = self.loss_bbox(
            preds_2d[..., 2:3], layer_bbox_targets_2d[..., 2:3],
            layer_reg_weights_2d[..., 2:3], avg_factor=avg_factor_2d)
        layer_loss_dim_2d = self.loss_bbox(
            preds_2d[..., 3:6], layer_bbox_targets_2d[..., 3:6],
            layer_reg_weights_2d[..., 3:6], avg_factor=avg_factor_2d)
        layer_loss_rot_2d = self.loss_bbox(
            preds_2d[..., 6:8], layer_bbox_targets_2d[..., 6:8],
            layer_reg_weights_2d[..., 6:8], avg_factor=avg_factor_2d)

        layer_match_ious_2d = matched_ious_2d[..., start:end]
        layer_match_ious_2d = torch.sum(layer_match_ious_2d*(layer_match_ious_2d>=0), dim=-1) / (torch.sum(layer_match_ious_2d>=0, dim=-1) + 1e-2)
        layer_match_ious_2d = torch.mean(layer_match_ious_2d)

        start = end

        loss_dict[f'{prefix}_loss_cls_2d'] = layer_loss_cls_2d
        loss_dict[f'{prefix}_loss_center_2d'] = layer_loss_center_2d
        loss_dict[f'{prefix}_loss_depth_2d'] = layer_loss_depth_2d
        loss_dict[f'{prefix}_loss_dim_2d'] = layer_loss_dim_2d
        loss_dict[f'{prefix}_loss_rot_2d'] = layer_loss_rot_2d
        if preds_2d.shape[-1] > 8:
            layer_loss_vel_2d = self.loss_bbox(
                preds_2d[..., 8:10], layer_bbox_targets_2d[..., 8:10],
                layer_reg_weights_2d[..., 8:10], avg_factor=avg_factor_2d)
            loss_dict[f'{prefix}_loss_vel_2d'] = layer_loss_vel_2d
        else:
            layer_loss_vel_2d = 0
        loss_dict[f'{prefix}_matched_ious_2d'] = layer_match_ious_2d
        # detached sum of the regression terms, stored alongside the losses
        loss_dict[f'{prefix}_reg_bbox_2d'] = (layer_loss_center_2d+layer_loss_depth_2d+layer_loss_dim_2d+layer_loss_rot_2d+layer_loss_vel_2d).detach()

    # ---- view transform branch (a single layer) ----
    if self.view_transform:
        avg_factor_view = max(num_pos_layer_view[0], 1)

        layer_labels_view = labels_view.reshape(-1)
        layer_label_weights_view = label_weights_view.reshape(-1)
        layer_cls_score = preds_dict['heatmap_view'].permute(0, 2, 1).reshape(-1, self.num_classes)
        layer_loss_cls_view = self.loss_cls(
            layer_cls_score, layer_labels_view, layer_label_weights_view,
            avg_factor=avg_factor_view)

        view_parts = [
            preds_dict['center_view'],
            preds_dict['height_view'],
            preds_dict['dim_view'],
            preds_dict['rot_view'],
        ]
        # Test for the key that is actually read ('vel_view'), matching
        # get_targets_single_view, rather than the unrelated 'vel' key.
        if 'vel_view' in preds_dict.keys():
            view_parts.append(preds_dict['vel_view'])
        preds_view = torch.cat(view_parts, dim=1).permute(0, 2, 1)  # [BS, num_proposals, code_size]

        layer_reg_weights_view = bbox_weights_view * bbox_weights_view.new_tensor(code_weights)
        layer_loss_bbox_view = self.loss_bbox(
            preds_view, bbox_targets_view, layer_reg_weights_view,
            avg_factor=avg_factor_view)

        # Same clamped mean as for the decoder layers above.
        matched_view = matched_ious_view >= 0
        layer_match_ious_view = torch.sum(matched_ious_view * matched_view, dim=-1) / torch.sum(matched_view, dim=-1).clamp(min=1)
        layer_match_ious_view = torch.mean(layer_match_ious_view)

        loss_dict['view_loss_cls'] = layer_loss_cls_view
        loss_dict['view_loss_bbox'] = layer_loss_bbox_view
        loss_dict['view_matched_ious'] = layer_match_ious_view

    return loss_dict
self.tasks = [ dict(num_class=1, class_names=['car'], indices=[0], radius=0.35), dict(num_class=1, class_names=['truck'], indices=[1], radius=0.35), dict(num_class=1, class_names=['construction_vehicle'], indices=[2], radius=0.35), dict(num_class=1, class_names=['bus'], indices=[3], radius=0.35), dict(num_class=1, class_names=['trailer'], indices=[4], radius=0.35), dict(num_class=1, class_names=['barrier'], indices=[5], radius=0.175), dict(num_class=1, class_names=['motorcycle'], indices=[6], radius=0.1), dict(num_class=1, class_names=['bicycle'], indices=[7], radius=-1), dict(num_class=1, class_names=['pedestrian'], indices=[8], radius=0.1), dict(num_class=1, class_names=['traffic_cone'], indices=[9], radius=0.1), ] # self.tasks = [ # dict(num_class=8, class_names=[], indices=[0, 1, 2, 3, 4, 5, 6, 7], radius=-1), # dict(num_class=1, class_names=['pedestrian'], indices=[8], radius=0.175), # dict(num_class=1, class_names=['traffic_cone'], indices=[9], radius=0.175), # ] elif self.test_cfg['dataset'] == 'Waymo': self.tasks = [ dict(num_class=1, class_names=['Car'], indices=[0], radius=0.7), dict(num_class=1, class_names=['Pedestrian'], indices=[1], radius=0.7), dict(num_class=1, class_names=['Cyclist'], indices=[2], radius=0.7), ] ret_layer = [] for i in range(batch_size): boxes3d = temp[i]['bboxes'] scores = temp[i]['scores'] labels = temp[i]['labels'] ## adopt circle nms for different categories if self.test_cfg['nms_type'] != None: keep_mask = torch.zeros_like(scores) for task in self.tasks: task_mask = torch.zeros_like(scores) for cls_idx in task['indices']: task_mask += labels == cls_idx task_mask = task_mask.bool() if task['radius'] > 0 and task_mask.sum() > 0: if self.test_cfg['nms_type'] == 'circle': boxes_for_nms = torch.cat([boxes3d[task_mask][:, :2], scores[:, None][task_mask]], dim=1) task_keep_indices = torch.tensor( circle_nms( boxes_for_nms.detach().cpu().numpy(), task['radius'], # 5, post_max_size=500 ) ) else: boxes_for_nms = 
def get_layer_num_proposal(self, idx_layer):
    """Return how many proposals the decoder layer ``idx_layer`` handles.

    Layers whose index is below ``self.num_pts_decoder_layers`` handle
    ``self.num_proposals`` queries; every later layer additionally handles
    the ``self.num_img_proposals`` image queries.

    Args:
        idx_layer (int): Index of the decoder layer.

    Returns:
        int: Number of proposals of that layer.
    """
    if idx_layer < self.num_pts_decoder_layers:
        return self.num_proposals
    return self.num_proposals + self.num_img_proposals
def __init__(self,
             num_classes,
             bbox_coder,
             in_channels=256,
             train_cfg=None,
             test_cfg=None,
             vote_module_cfg=None,
             vote_aggregation_cfg=None,
             pred_layer_cfg=None,
             conv_cfg=dict(type='Conv1d'),
             norm_cfg=dict(type='BN1d'),
             act_cfg=dict(type='ReLU'),
             objectness_loss=None,
             center_loss=None,
             dir_class_loss=None,
             dir_res_loss=None,
             size_res_loss=None,
             corner_loss=None,
             vote_loss=None):
    """Set up the parent vote head and the two 3DSSD-specific losses.

    ``in_channels`` and ``act_cfg`` are accepted but are not passed on to
    the parent constructor; the size-class and semantic losses of the
    parent are explicitly disabled (``None``).
    """
    # keyword arguments handed to the parent head
    parent_kwargs = dict(
        train_cfg=train_cfg,
        test_cfg=test_cfg,
        vote_module_cfg=vote_module_cfg,
        vote_aggregation_cfg=vote_aggregation_cfg,
        pred_layer_cfg=pred_layer_cfg,
        conv_cfg=conv_cfg,
        norm_cfg=norm_cfg,
        objectness_loss=objectness_loss,
        center_loss=center_loss,
        dir_class_loss=dir_class_loss,
        dir_res_loss=dir_res_loss,
        size_class_loss=None,
        size_res_loss=size_res_loss,
        semantic_loss=None)
    super(SSD3DHead, self).__init__(num_classes, bbox_coder, **parent_kwargs)

    # losses that exist only in this head
    self.corner_loss = build_loss(corner_loss)
    self.vote_loss = build_loss(vote_loss)
    # number of candidate points, read from the vote module config
    self.num_candidates = vote_module_cfg['num_points']
classification and regression # (center residual (3), size regression (3) # heading class+residual (num_dir_bins*2)), return 3 + 3 + self.num_dir_bins * 2 def _extract_input(self, feat_dict): """Extract inputs from features dictionary. Args: feat_dict (dict): Feature dict from backbone. Returns: torch.Tensor: Coordinates of input points. torch.Tensor: Features of input points. torch.Tensor: Indices of input points. """ seed_points = feat_dict['sa_xyz'][-1] seed_features = feat_dict['sa_features'][-1] seed_indices = feat_dict['sa_indices'][-1] return seed_points, seed_features, seed_indices @force_fp32(apply_to=('bbox_preds', )) def loss(self, bbox_preds, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, img_metas=None, gt_bboxes_ignore=None): """Compute loss. Args: bbox_preds (dict): Predictions from forward of SSD3DHead. points (list[torch.Tensor]): Input points. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \ bboxes of each sample. gt_labels_3d (list[torch.Tensor]): Labels of each sample. pts_semantic_mask (None | list[torch.Tensor]): Point-wise semantic mask. pts_instance_mask (None | list[torch.Tensor]): Point-wise instance mask. img_metas (list[dict]): Contain pcd and img's meta info. gt_bboxes_ignore (None | list[torch.Tensor]): Specify which bounding. Returns: dict: Losses of 3DSSD. 
""" targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, bbox_preds) (vote_targets, center_targets, size_res_targets, dir_class_targets, dir_res_targets, mask_targets, centerness_targets, corner3d_targets, vote_mask, positive_mask, negative_mask, centerness_weights, box_loss_weights, heading_res_loss_weight) = targets # calculate centerness loss centerness_loss = self.objectness_loss( bbox_preds['obj_scores'].transpose(2, 1), centerness_targets, weight=centerness_weights) # calculate center loss center_loss = self.center_loss( bbox_preds['center_offset'], center_targets, weight=box_loss_weights.unsqueeze(-1)) # calculate direction class loss dir_class_loss = self.dir_class_loss( bbox_preds['dir_class'].transpose(1, 2), dir_class_targets, weight=box_loss_weights) # calculate direction residual loss dir_res_loss = self.dir_res_loss( bbox_preds['dir_res_norm'], dir_res_targets.unsqueeze(-1).repeat(1, 1, self.num_dir_bins), weight=heading_res_loss_weight) # calculate size residual loss size_loss = self.size_res_loss( bbox_preds['size'], size_res_targets, weight=box_loss_weights.unsqueeze(-1)) # calculate corner loss one_hot_dir_class_targets = dir_class_targets.new_zeros( bbox_preds['dir_class'].shape) one_hot_dir_class_targets.scatter_(2, dir_class_targets.unsqueeze(-1), 1) pred_bbox3d = self.bbox_coder.decode( dict( center=bbox_preds['center'], dir_res=bbox_preds['dir_res'], dir_class=one_hot_dir_class_targets, size=bbox_preds['size'])) pred_bbox3d = pred_bbox3d.reshape(-1, pred_bbox3d.shape[-1]) pred_bbox3d = img_metas[0]['box_type_3d']( pred_bbox3d.clone(), box_dim=pred_bbox3d.shape[-1], with_yaw=self.bbox_coder.with_rot, origin=(0.5, 0.5, 0.5)) pred_corners3d = pred_bbox3d.corners.reshape(-1, 8, 3) corner_loss = self.corner_loss( pred_corners3d, corner3d_targets.reshape(-1, 8, 3), weight=box_loss_weights.view(-1, 1, 1)) # calculate vote loss vote_loss = self.vote_loss( bbox_preds['vote_offset'].transpose(1, 2), 
def get_targets(self,
                points,
                gt_bboxes_3d,
                gt_labels_3d,
                pts_semantic_mask=None,
                pts_instance_mask=None,
                bbox_preds=None):
    """Build the batched training targets of the ssd3d head.

    Per-sample targets come from ``get_targets_single``; they are stacked
    along a new leading batch axis and turned into normalised loss weights.

    Args:
        points (list[torch.Tensor]): Points of each batch.
        gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \
            bboxes of each batch. Entries of samples without labels are
            replaced in place by a single all-zero box.
        gt_labels_3d (list[torch.Tensor]): Labels of each batch. Empty
            entries are replaced in place by a single zero label.
        pts_semantic_mask (None | list[torch.Tensor]): Point-wise semantic
            label of each batch.
        pts_instance_mask (None | list[torch.Tensor]): Point-wise instance
            label of each batch. Ignored when ``pts_semantic_mask`` is
            None.
        bbox_preds (torch.Tensor): Bounding box predictions of ssd3d head.

    Returns:
        tuple[torch.Tensor]: Targets of ssd3d head.
    """
    num_samples = len(gt_labels_3d)

    # Give every sample without annotations one placeholder box / label.
    for sample_idx, sample_labels in enumerate(gt_labels_3d):
        if len(sample_labels) != 0:
            continue
        sample_boxes = gt_bboxes_3d[sample_idx]
        placeholder = sample_boxes.tensor.new_zeros(
            1, sample_boxes.tensor.shape[-1])
        gt_bboxes_3d[sample_idx] = sample_boxes.new_box(placeholder)
        gt_labels_3d[sample_idx] = sample_labels.new_zeros(1)

    if pts_semantic_mask is None:
        pts_semantic_mask = [None] * num_samples
        pts_instance_mask = [None] * num_samples

    # split the batched predictions into one entry per sample
    aggregated_points = []
    seed_points = []
    for sample_idx in range(num_samples):
        aggregated_points.append(
            bbox_preds['aggregated_points'][sample_idx])
        seed_points.append(
            bbox_preds['seed_points'][sample_idx, :self.num_candidates]
            .detach())

    per_sample = multi_apply(self.get_targets_single, points,
                             gt_bboxes_3d, gt_labels_3d,
                             pts_semantic_mask, pts_instance_mask,
                             aggregated_points, seed_points)
    (vote_targets, center_targets, size_res_targets, dir_class_targets,
     dir_res_targets, mask_targets, centerness_targets, corner3d_targets,
     vote_mask, positive_mask, negative_mask) = per_sample

    # stack the per-sample results into batched tensors
    vote_targets = torch.stack(vote_targets)
    vote_mask = torch.stack(vote_mask)
    center_targets = torch.stack(center_targets)
    size_res_targets = torch.stack(size_res_targets)
    dir_class_targets = torch.stack(dir_class_targets)
    dir_res_targets = torch.stack(dir_res_targets)
    mask_targets = torch.stack(mask_targets)
    centerness_targets = torch.stack(centerness_targets).detach()
    corner3d_targets = torch.stack(corner3d_targets)
    positive_mask = torch.stack(positive_mask)
    negative_mask = torch.stack(negative_mask)

    # centers are regressed as offsets from the aggregated points
    center_targets -= bbox_preds['aggregated_points']

    # each weight tensor is divided by its own sum (plus a small epsilon)
    labelled_mask = positive_mask + negative_mask
    centerness_weights = labelled_mask.unsqueeze(-1).repeat(
        1, 1, self.num_classes).float()
    centerness_weights = centerness_weights / (
        centerness_weights.sum() + 1e-6)
    vote_mask = vote_mask / (vote_mask.sum() + 1e-6)
    box_loss_weights = positive_mask / (positive_mask.sum() + 1e-6)

    # heading residual weights are non-zero only at the target bin
    one_hot_shape = tuple(dir_class_targets.shape[:2]) + (
        self.num_dir_bins, )
    heading_label_one_hot = dir_class_targets.new_zeros(one_hot_shape)
    heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1)
    heading_res_loss_weight = (
        heading_label_one_hot * box_loss_weights.unsqueeze(-1))

    return (vote_targets, center_targets, size_res_targets,
            dir_class_targets, dir_res_targets, mask_targets,
            centerness_targets, corner3d_targets, vote_mask,
            positive_mask, negative_mask, centerness_weights,
            box_loss_weights, heading_res_loss_weight)
seed_points (torch.Tensor): Seed points of candidate points. Returns: tuple[torch.Tensor]: Targets of ssd3d head. """ assert self.bbox_coder.with_rot or pts_semantic_mask is not None gt_bboxes_3d = gt_bboxes_3d.to(points.device) valid_gt = gt_labels_3d != -1 gt_bboxes_3d = gt_bboxes_3d[valid_gt] gt_labels_3d = gt_labels_3d[valid_gt] # Generate fake GT for empty scene if valid_gt.sum() == 0: vote_targets = points.new_zeros(self.num_candidates, 3) center_targets = points.new_zeros(self.num_candidates, 3) size_res_targets = points.new_zeros(self.num_candidates, 3) dir_class_targets = points.new_zeros( self.num_candidates, dtype=torch.int64) dir_res_targets = points.new_zeros(self.num_candidates) mask_targets = points.new_zeros( self.num_candidates, dtype=torch.int64) centerness_targets = points.new_zeros(self.num_candidates, self.num_classes) corner3d_targets = points.new_zeros(self.num_candidates, 8, 3) vote_mask = points.new_zeros(self.num_candidates, dtype=torch.bool) positive_mask = points.new_zeros( self.num_candidates, dtype=torch.bool) negative_mask = points.new_ones( self.num_candidates, dtype=torch.bool) return (vote_targets, center_targets, size_res_targets, dir_class_targets, dir_res_targets, mask_targets, centerness_targets, corner3d_targets, vote_mask, positive_mask, negative_mask) gt_corner3d = gt_bboxes_3d.corners (center_targets, size_targets, dir_class_targets, dir_res_targets) = self.bbox_coder.encode(gt_bboxes_3d, gt_labels_3d) points_mask, assignment = self._assign_targets_by_points_inside( gt_bboxes_3d, aggregated_points) center_targets = center_targets[assignment] size_res_targets = size_targets[assignment] mask_targets = gt_labels_3d[assignment] dir_class_targets = dir_class_targets[assignment] dir_res_targets = dir_res_targets[assignment] corner3d_targets = gt_corner3d[assignment] top_center_targets = center_targets.clone() top_center_targets[:, 2] += size_res_targets[:, 2] dist = torch.norm(aggregated_points - top_center_targets, dim=1) 
dist_mask = dist < self.train_cfg.pos_distance_thr positive_mask = (points_mask.max(1)[0] > 0) * dist_mask negative_mask = (points_mask.max(1)[0] == 0) # Centerness loss targets canonical_xyz = aggregated_points - center_targets if self.bbox_coder.with_rot: # TODO: Align points rotation implementation of # LiDARInstance3DBoxes and DepthInstance3DBoxes canonical_xyz = rotation_3d_in_axis( canonical_xyz.unsqueeze(0).transpose(0, 1), -gt_bboxes_3d.yaw[assignment], 2).squeeze(1) distance_front = torch.clamp( size_res_targets[:, 0] - canonical_xyz[:, 0], min=0) distance_back = torch.clamp( size_res_targets[:, 0] + canonical_xyz[:, 0], min=0) distance_left = torch.clamp( size_res_targets[:, 1] - canonical_xyz[:, 1], min=0) distance_right = torch.clamp( size_res_targets[:, 1] + canonical_xyz[:, 1], min=0) distance_top = torch.clamp( size_res_targets[:, 2] - canonical_xyz[:, 2], min=0) distance_bottom = torch.clamp( size_res_targets[:, 2] + canonical_xyz[:, 2], min=0) centerness_l = torch.min(distance_front, distance_back) / torch.max( distance_front, distance_back) centerness_w = torch.min(distance_left, distance_right) / torch.max( distance_left, distance_right) centerness_h = torch.min(distance_bottom, distance_top) / torch.max( distance_bottom, distance_top) centerness_targets = torch.clamp( centerness_l * centerness_w * centerness_h, min=0) centerness_targets = centerness_targets.pow(1 / 3.0) centerness_targets = torch.clamp(centerness_targets, min=0, max=1) proposal_num = centerness_targets.shape[0] one_hot_centerness_targets = centerness_targets.new_zeros( (proposal_num, self.num_classes)) one_hot_centerness_targets.scatter_(1, mask_targets.unsqueeze(-1), 1) centerness_targets = centerness_targets.unsqueeze( 1) * one_hot_centerness_targets # Vote loss targets enlarged_gt_bboxes_3d = gt_bboxes_3d.enlarged_box( self.train_cfg.expand_dims_length) enlarged_gt_bboxes_3d.tensor[:, 2] -= self.train_cfg.expand_dims_length vote_mask, vote_assignment = 
def get_bboxes(self, points, bbox_preds, input_metas, rescale=False):
    """Generate bboxes from ssd3d head predictions.

    Args:
        points (torch.Tensor): Input points.
        bbox_preds (dict): Predictions from ssd3d head. The entry
            ``'obj_scores'`` is read here; the whole dict is handed to the
            bbox coder for decoding.
        input_metas (list[dict]): Point cloud and image's meta info. Each
            entry must provide the box class under ``'box_type_3d'``.
        rescale (bool): Whether to rescale bboxes. Kept for interface
            compatibility; this method does not read it.

    Returns:
        list[tuple[torch.Tensor]]: Bounding boxes, scores and labels.
    """
    # Per-class scores with the class axis moved last (the NMS step takes an
    # argmax over the last axis). ``Tensor.sigmoid`` replaces the deprecated
    # ``F.sigmoid`` and yields the same values.
    sem_scores = bbox_preds['obj_scores'].sigmoid().transpose(1, 2)
    # The objectness of a box is its best per-class score.
    obj_scores = sem_scores.max(-1)[0]
    bbox3d = self.bbox_coder.decode(bbox_preds)

    results = []
    for b in range(bbox3d.shape[0]):
        bbox_selected, score_selected, labels = self.multiclass_nms_single(
            obj_scores[b], sem_scores[b], bbox3d[b], points[b, ..., :3],
            input_metas[b])
        # Wrap the surviving boxes in the sample's own box structure.
        bbox = input_metas[b]['box_type_3d'](
            bbox_selected.clone(),
            box_dim=bbox_selected.shape[-1],
            with_yaw=self.bbox_coder.with_rot)
        results.append((bbox, score_selected, labels))

    return results
""" num_bbox = bbox.shape[0] bbox = input_meta['box_type_3d']( bbox.clone(), box_dim=bbox.shape[-1], with_yaw=self.bbox_coder.with_rot, origin=(0.5, 0.5, 1.0)) if isinstance(bbox, LiDARInstance3DBoxes): box_idx = bbox.points_in_boxes(points) box_indices = box_idx.new_zeros([num_bbox + 1]) box_idx[box_idx == -1] = num_bbox box_indices.scatter_add_(0, box_idx.long(), box_idx.new_ones(box_idx.shape)) box_indices = box_indices[:-1] nonempty_box_mask = box_indices >= 0 elif isinstance(bbox, DepthInstance3DBoxes): box_indices = bbox.points_in_boxes(points) nonempty_box_mask = box_indices.T.sum(1) >= 0 else: raise NotImplementedError('Unsupported bbox type!') corner3d = bbox.corners minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6))) minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0] minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0] bbox_classes = torch.argmax(sem_scores, -1) nms_selected = batched_nms( minmax_box3d[nonempty_box_mask][:, [0, 1, 3, 4]], obj_scores[nonempty_box_mask], bbox_classes[nonempty_box_mask], self.test_cfg.nms_cfg)[1] if nms_selected.shape[0] > self.test_cfg.max_output_num: nms_selected = nms_selected[:self.test_cfg.max_output_num] # filter empty boxes and boxes with low score scores_mask = (obj_scores >= self.test_cfg.score_thr) nonempty_box_inds = torch.nonzero( nonempty_box_mask, as_tuple=False).flatten() nonempty_mask = torch.zeros_like(bbox_classes).scatter( 0, nonempty_box_inds[nms_selected], 1) selected = (nonempty_mask.bool() & scores_mask.bool()) if self.test_cfg.per_class_proposal: bbox_selected, score_selected, labels = [], [], [] for k in range(sem_scores.shape[-1]): bbox_selected.append(bbox[selected].tensor) score_selected.append(obj_scores[selected]) labels.append( torch.zeros_like(bbox_classes[selected]).fill_(k)) bbox_selected = torch.cat(bbox_selected, 0) score_selected = torch.cat(score_selected, 0) labels = torch.cat(labels, 0) else: bbox_selected = bbox[selected].tensor score_selected = obj_scores[selected] labels 
= bbox_classes[selected] return bbox_selected, score_selected, labels def _assign_targets_by_points_inside(self, bboxes_3d, points): """Compute assignment by checking whether point is inside bbox. Args: bboxes_3d (BaseInstance3DBoxes): Instance of bounding boxes. points (torch.Tensor): Points of a batch. Returns: tuple[torch.Tensor]: Flags indicating whether each point is inside bbox and the index of box where each point are in. """ # TODO: align points_in_boxes function in each box_structures num_bbox = bboxes_3d.tensor.shape[0] if isinstance(bboxes_3d, LiDARInstance3DBoxes): assignment = bboxes_3d.points_in_boxes(points).long() points_mask = assignment.new_zeros( [assignment.shape[0], num_bbox + 1]) assignment[assignment == -1] = num_bbox points_mask.scatter_(1, assignment.unsqueeze(1), 1) points_mask = points_mask[:, :-1] assignment[assignment == num_bbox] = num_bbox - 1 elif isinstance(bboxes_3d, DepthInstance3DBoxes): points_mask = bboxes_3d.points_in_boxes(points) assignment = points_mask.argmax(dim=-1) else: raise NotImplementedError('Unsupported bbox type!') return points_mask, assignment ================================================ FILE: mmdet3d/models/dense_heads/train_mixins.py ================================================ import numpy as np import torch from mmdet3d.core import limit_period from mmdet.core import images_to_levels, multi_apply class AnchorTrainMixin(object): """Mixin class for target assigning of dense heads.""" def anchor_target_3d(self, anchor_list, gt_bboxes_list, input_metas, gt_bboxes_ignore_list=None, gt_labels_list=None, label_channels=1, num_classes=1, sampling=True): """Compute regression and classification targets for anchors. Args: anchor_list (list[list]): Multi level anchors of each image. gt_bboxes_list (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each image. input_metas (list[dict]): Meta info of each image. gt_bboxes_ignore_list (None | list): Ignore list of gt bboxes. 
gt_labels_list (list[torch.Tensor]): Gt labels of batches. label_channels (int): The channel of labels. num_classes (int): The number of classes. sampling (bool): Whether to sample anchors. Returns: tuple (list, list, list, list, list, list, int, int): Anchor targets, including labels, label weights, bbox targets, bbox weights, direction targets, direction weights, number of postive anchors and number of negative anchors. """ num_imgs = len(input_metas) assert len(anchor_list) == num_imgs if isinstance(anchor_list[0][0], list): # sizes of anchors are different # anchor number of a single level num_level_anchors = [ sum([anchor.size(0) for anchor in anchors]) for anchors in anchor_list[0] ] for i in range(num_imgs): anchor_list[i] = anchor_list[i][0] else: # anchor number of multi levels num_level_anchors = [ anchors.view(-1, self.box_code_size).size(0) for anchors in anchor_list[0] ] # concat all level anchors and flags to a single tensor for i in range(num_imgs): anchor_list[i] = torch.cat(anchor_list[i]) # compute targets for each image if gt_bboxes_ignore_list is None: gt_bboxes_ignore_list = [None for _ in range(num_imgs)] if gt_labels_list is None: gt_labels_list = [None for _ in range(num_imgs)] (all_labels, all_label_weights, all_bbox_targets, all_bbox_weights, all_dir_targets, all_dir_weights, pos_inds_list, neg_inds_list) = multi_apply( self.anchor_target_3d_single, anchor_list, gt_bboxes_list, gt_bboxes_ignore_list, gt_labels_list, input_metas, label_channels=label_channels, num_classes=num_classes, sampling=sampling) # no valid anchors if any([labels is None for labels in all_labels]): return None # sampled anchors of all images num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list]) num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list]) # split targets to a list w.r.t. 
def anchor_target_3d_single(self,
                            anchors,
                            gt_bboxes,
                            gt_bboxes_ignore,
                            gt_labels,
                            input_meta,
                            label_channels=1,
                            num_classes=1,
                            sampling=True):
    """Compute targets of anchors in single batch.

    Three cases are handled, depending on ``self.bbox_assigner`` and
    ``anchors``:

    * a single assigner: everything is delegated to
      ``anchor_target_single_assigner``;
    * a list of assigners and one anchor tensor: assigner ``i`` handles the
      slice ``anchors[..., i, :, :]`` and the results are interleaved back
      into the anchor tensor's layout;
    * a list of assigners and a list of anchors: assigner ``i`` handles
      ``anchors[i]`` and the results are concatenated in order.

    Args:
        anchors (torch.Tensor | list[torch.Tensor]): Concatenated
            multi-level anchor, or one anchor tensor per assigner.
        gt_bboxes (:obj:`BaseInstance3DBoxes`): Gt bboxes.
        gt_bboxes_ignore (torch.Tensor): Ignored gt bboxes.
        gt_labels (torch.Tensor): Gt class labels.
        input_meta (dict): Meta info of each image.
        label_channels (int): The channel of labels. Not read by this
            method.
        num_classes (int): The number of classes.
        sampling (bool): Whether to sample anchors.

    Returns:
        tuple[torch.Tensor]: Anchor targets, in order: labels, label
            weights, bbox targets, bbox weights, direction targets,
            direction weights, positive indices and negative indices.
            With several assigners the index tensors are concatenated as
            returned by each assigner, without any offset applied.
    """
    if not isinstance(self.bbox_assigner, list):
        return self.anchor_target_single_assigner(
            self.bbox_assigner, anchors, gt_bboxes, gt_bboxes_ignore,
            gt_labels, input_meta, num_classes, sampling)

    def assign(idx, assigner, cls_anchors):
        """Run assigner ``idx`` on its anchors, optionally on its class only."""
        if self.assign_per_class:
            gt_per_cls = (gt_labels == idx)
            return self.anchor_target_single_assigner(
                assigner, cls_anchors, gt_bboxes[gt_per_cls, :],
                gt_bboxes_ignore, gt_labels[gt_per_cls], input_meta,
                num_classes, sampling)
        return self.anchor_target_single_assigner(
            assigner, cls_anchors, gt_bboxes, gt_bboxes_ignore, gt_labels,
            input_meta, num_classes, sampling)

    (total_labels, total_label_weights, total_bbox_targets,
     total_bbox_weights, total_dir_targets, total_dir_weights,
     total_pos_inds, total_neg_inds) = [], [], [], [], [], [], [], []

    if isinstance(anchors, list):
        # class-aware anchors with different feature map sizes
        assert len(self.bbox_assigner) == len(anchors), \
            'The number of bbox assigners and anchors should be the same.'
        for i, assigner in enumerate(self.bbox_assigner):
            (labels, label_weights, bbox_targets, bbox_weights,
             dir_targets, dir_weights, pos_inds,
             neg_inds) = assign(i, assigner, anchors[i])
            total_labels.append(labels)
            total_label_weights.append(label_weights)
            total_bbox_targets.append(
                bbox_targets.reshape(-1, anchors[i].size(-1)))
            total_bbox_weights.append(
                bbox_weights.reshape(-1, anchors[i].size(-1)))
            total_dir_targets.append(dir_targets)
            total_dir_weights.append(dir_weights)
            total_pos_inds.append(pos_inds)
            total_neg_inds.append(neg_inds)

        return (torch.cat(total_labels, dim=0),
                torch.cat(total_label_weights, dim=0),
                torch.cat(total_bbox_targets, dim=0),
                torch.cat(total_bbox_weights, dim=0),
                torch.cat(total_dir_targets, dim=0),
                torch.cat(total_dir_weights, dim=0),
                torch.cat(total_pos_inds, dim=0),
                torch.cat(total_neg_inds, dim=0))

    # One anchor tensor shared by all assigners: the third-to-last axis
    # indexes the assigner, the second-to-last the rotation.
    feat_size = anchors.size(0) * anchors.size(1) * anchors.size(2)
    rot_angles = anchors.size(-2)
    assert len(self.bbox_assigner) == anchors.size(-3)
    code_size = anchors.size(-1)

    for i, assigner in enumerate(self.bbox_assigner):
        current_anchors = anchors[..., i, :, :].reshape(
            -1, self.box_code_size)
        (labels, label_weights, bbox_targets, bbox_weights, dir_targets,
         dir_weights, pos_inds, neg_inds) = assign(i, assigner,
                                                   current_anchors)
        # Give each result a singleton assigner axis so that concatenating
        # along it restores the layout of ``anchors``.
        total_labels.append(labels.reshape(feat_size, 1, rot_angles))
        total_label_weights.append(
            label_weights.reshape(feat_size, 1, rot_angles))
        total_bbox_targets.append(
            bbox_targets.reshape(feat_size, 1, rot_angles, code_size))
        total_bbox_weights.append(
            bbox_weights.reshape(feat_size, 1, rot_angles, code_size))
        total_dir_targets.append(
            dir_targets.reshape(feat_size, 1, rot_angles))
        total_dir_weights.append(
            dir_weights.reshape(feat_size, 1, rot_angles))
        total_pos_inds.append(pos_inds)
        total_neg_inds.append(neg_inds)

    return (torch.cat(total_labels, dim=-2).reshape(-1),
            torch.cat(total_label_weights, dim=-2).reshape(-1),
            torch.cat(total_bbox_targets, dim=-3).reshape(-1, code_size),
            torch.cat(total_bbox_weights, dim=-3).reshape(-1, code_size),
            torch.cat(total_dir_targets, dim=-2).reshape(-1),
            torch.cat(total_dir_weights, dim=-2).reshape(-1),
            torch.cat(total_pos_inds, dim=0).reshape(-1),
            torch.cat(total_neg_inds, dim=0).reshape(-1))
def get_direction_target(anchors,
                         reg_targets,
                         dir_offset=0,
                         num_bins=2,
                         one_hot=True):
    """Encode direction to 0 ~ num_bins-1.

    Args:
        anchors (torch.Tensor): Concatenated multi-level anchor. Index 6 of
            the last axis is read as the anchor rotation.
        reg_targets (torch.Tensor): Bbox regression targets. Index 6 of the
            last axis is read as the rotation residual.
        dir_offset (int): Direction offset.
        num_bins (int): Number of bins to divide 2*PI.
        one_hot (bool): Whether to encode as one hot.

    Returns:
        torch.Tensor: Encoded direction targets. Bin indices (int64) when
            ``one_hot`` is False; otherwise a one-hot tensor with a trailing
            axis of size ``num_bins`` and the dtype of ``anchors``.
    """
    # Recover the absolute rotation, shift it by the offset and wrap it
    # with period 2*PI before binning.
    rot_gt = reg_targets[..., 6] + anchors[..., 6]
    offset_rot = limit_period(rot_gt - dir_offset, 0, 2 * np.pi)
    dir_cls_targets = torch.floor(offset_rot / (2 * np.pi / num_bins)).long()
    # Guard against a value landing exactly on the upper boundary.
    dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1)
    if one_hot:
        dir_targets = torch.zeros(
            *list(dir_cls_targets.shape),
            num_bins,
            dtype=anchors.dtype,
            device=dir_cls_targets.device)
        # scatter_ takes the target dimension first; without it the call
        # raises TypeError. The bins live on the last axis.
        dir_targets.scatter_(-1, dir_cls_targets.unsqueeze(dim=-1), 1.0)
        dir_cls_targets = dir_targets
    return dir_cls_targets
embedding, learned. """ def __init__(self, input_channel, num_pos_feats=288): super().__init__() self.position_embedding_head = nn.Sequential( nn.Conv1d(input_channel, num_pos_feats, kernel_size=1), nn.BatchNorm1d(num_pos_feats), nn.ReLU(inplace=True), nn.Conv1d(num_pos_feats, num_pos_feats, kernel_size=1)) def forward(self, xyz): xyz = xyz.transpose(1, 2).contiguous() position_embedding = self.position_embedding_head(xyz) return position_embedding class TransformerDecoderLayer(nn.Module): def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", self_posembed=None, cross_posembed=None, cross_only=False): super().__init__() self.cross_only = cross_only if not self.cross_only: self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout) # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.norm3 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(dropout) self.dropout3 = nn.Dropout(dropout) def _get_activation_fn(activation): """Return an activation function given a string""" if activation == "relu": return F.relu if activation == "gelu": return F.gelu if activation == "glu": return F.glu raise RuntimeError(F"activation should be relu/gelu, not {activation}.") self.activation = _get_activation_fn(activation) self.self_posembed = self_posembed self.cross_posembed = cross_posembed def with_pos_embed(self, tensor, pos_embed): return tensor if pos_embed is None else tensor + pos_embed def forward(self, query, key, query_pos, key_pos, attn_mask=None): """ :param query: B C Pq :param key: B C Pk :param query_pos: B Pq 3/6 :param key_pos: B Pk 3/6 :param value_pos: [B Pq 3/6] :return: """ # NxCxP to PxNxC if self.self_posembed is not 
class MultiheadAttention(nn.Module):
    r"""Allows the model to jointly attend to information
    from different representation subspaces.
    See reference: Attention Is All You Need

    .. math::
        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
        \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)

    Args:
        embed_dim: total dimension of the model.
        num_heads: parallel attention heads.
        dropout: a Dropout layer on attn_output_weights. Default: 0.0.
        bias: add bias as module parameter. Default: True.
        add_bias_kv: add bias to the key and value sequences at dim=0.
        add_zero_attn: add a new batch of zeros to the key and
            value sequences at dim=1.
        kdim: total number of features in key. Default: None.
        vdim: total number of features in value. Default: None.

        Note: if kdim and vdim are None, they will be set to embed_dim such
        that query, key, and value have the same number of features.

    Examples::

        >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
        >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
    """

    def __init__(self, embed_dim, num_heads, dropout=0., bias=True,
                 add_bias_kv=False, add_zero_attn=False, kdim=None,
                 vdim=None):
        super(MultiheadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        # Selects between the packed projection and separate q/k/v weights.
        self._qkv_same_embed_dim = (
            self.kdim == embed_dim and self.vdim == embed_dim)

        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, \
            "embed_dim must be divisible by num_heads"

        # The packed weight is always registered; separate weights are added
        # only when key/value feature sizes differ from embed_dim.
        self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))

        if self._qkv_same_embed_dim is False:
            self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
            self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
            self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))

        if bias:
            self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
        else:
            self.register_parameter('in_proj_bias', None)
        self.out_proj = Linear(embed_dim, embed_dim, bias=bias)

        if add_bias_kv:
            self.bias_k = Parameter(torch.empty(1, 1, embed_dim))
            self.bias_v = Parameter(torch.empty(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise projection weights and biases in place."""
        if self._qkv_same_embed_dim:
            xavier_uniform_(self.in_proj_weight)
        else:
            xavier_uniform_(self.q_proj_weight)
            xavier_uniform_(self.k_proj_weight)
            xavier_uniform_(self.v_proj_weight)

        if self.in_proj_bias is not None:
            constant_(self.in_proj_bias, 0.)
            constant_(self.out_proj.bias, 0.)
        # Reached via ``nn.init`` because ``xavier_normal_`` is not among the
        # names this file imports; the bare name raised NameError whenever
        # add_bias_kv=True.
        if self.bias_k is not None:
            nn.init.xavier_normal_(self.bias_k)
        if self.bias_v is not None:
            nn.init.xavier_normal_(self.bias_v)

    def forward(self, query, key, value, key_padding_mask=None,
                need_weights=True, attn_mask=None):
        r"""
    Args:
        query, key, value: map a query and a set of key-value pairs to an output.
            See "Attention Is All You Need" for more details.
        key_padding_mask: if provided, specified padding elements in the key will
            be ignored by the attention. This is an binary mask. When the value is True,
            the corresponding value on the attention layer will be filled with -inf.
        need_weights: output attn_output_weights.
        attn_mask: mask that prevents attention to certain positions. This is an additive mask
            (i.e. the values will be added to the attention layer).

    Shape:
        - Inputs:
        - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
          the embedding dimension.
        - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
          the embedding dimension.
        - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
          the embedding dimension.
        - key_padding_mask: :math:`(N, S)`, ByteTensor, where N is the batch size, S is the source sequence length.
        - attn_mask: :math:`(L, S)` where L is the target sequence length, S is the source sequence length.

        - Outputs:
        - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
          E is the embedding dimension.
        - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
          L is the target sequence length, S is the source sequence length.
        """
        if hasattr(self, '_qkv_same_embed_dim') and \
                self._qkv_same_embed_dim is False:
            return multi_head_attention_forward(
                query, key, value, self.embed_dim, self.num_heads,
                self.in_proj_weight, self.in_proj_bias,
                self.bias_k, self.bias_v, self.add_zero_attn,
                self.dropout, self.out_proj.weight, self.out_proj.bias,
                training=self.training,
                key_padding_mask=key_padding_mask, need_weights=need_weights,
                attn_mask=attn_mask, use_separate_proj_weight=True,
                q_proj_weight=self.q_proj_weight,
                k_proj_weight=self.k_proj_weight,
                v_proj_weight=self.v_proj_weight)

        if not hasattr(self, '_qkv_same_embed_dim'):
            # The attribute is missing only on instances that did not go
            # through this __init__. Imported locally because this file
            # does not import ``warnings`` at module level.
            import warnings
            warnings.warn(
                'A new version of MultiheadAttention module has been '
                'implemented. Please re-train your model with the new module',
                UserWarning)

        return multi_head_attention_forward(
            query, key, value, self.embed_dim, self.num_heads,
            self.in_proj_weight, self.in_proj_bias,
            self.bias_k, self.bias_v, self.add_zero_attn,
            self.dropout, self.out_proj.weight, self.out_proj.bias,
            training=self.training,
            key_padding_mask=key_padding_mask, need_weights=need_weights,
            attn_mask=attn_mask)
-> Tuple[Tensor, Optional[Tensor]] r""" Args: query, key, value: map a query and a set of key-value pairs to an output. See "Attention Is All You Need" for more details. embed_dim_to_check: total dimension of the model. num_heads: parallel attention heads. in_proj_weight, in_proj_bias: input projection weight and bias. bias_k, bias_v: bias of the key and value sequences to be added at dim=0. add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. dropout_p: probability of an element to be zeroed. out_proj_weight, out_proj_bias: the output projection weight and bias. training: apply dropout if is ``True``. key_padding_mask: if provided, specified padding elements in the key will be ignored by the attention. This is an binary mask. When the value is True, the corresponding value on the attention layer will be filled with -inf. need_weights: output attn_output_weights. attn_mask: mask that prevents attention to certain positions. This is an additive mask (i.e. the values will be added to the attention layer). use_separate_proj_weight: the function accept the proj. weights for query, key, and value in differnt forms. If false, in_proj_weight will be used, which is a combination of q_proj_weight, k_proj_weight, v_proj_weight. q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias. static_k, static_v: static key and value used for attention operators. Shape: Inputs: - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is the embedding dimension. - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is the embedding dimension. - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is the embedding dimension. - key_padding_mask: :math:`(N, S)`, ByteTensor, where N is the batch size, S is the source sequence length. 
- attn_mask: :math:`(L, S)` where L is the target sequence length, S is the source sequence length. - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length, N is the batch size, E is the embedding dimension. E/num_heads is the head dimension. - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length, N is the batch size, E is the embedding dimension. E/num_heads is the head dimension. Outputs: - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is the embedding dimension. - attn_output_weights: :math:`(N, L, S)` where N is the batch size, L is the target sequence length, S is the source sequence length. """ qkv_same = torch.equal(query, key) and torch.equal(key, value) kv_same = torch.equal(key, value) tgt_len, bsz, embed_dim = query.size() assert embed_dim == embed_dim_to_check assert list(query.size()) == [tgt_len, bsz, embed_dim] assert key.size() == value.size() head_dim = embed_dim // num_heads assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" scaling = float(head_dim) ** -0.5 if use_separate_proj_weight is not True: if qkv_same: # self-attention q, k, v = F.linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1) elif kv_same: # encoder-decoder attention # This is inline in_proj function with in_proj_weight and in_proj_bias _b = in_proj_bias _start = 0 _end = embed_dim _w = in_proj_weight[_start:_end, :] if _b is not None: _b = _b[_start:_end] q = F.linear(query, _w, _b) if key is None: assert value is None k = None v = None else: # This is inline in_proj function with in_proj_weight and in_proj_bias _b = in_proj_bias _start = embed_dim _end = None _w = in_proj_weight[_start:, :] if _b is not None: _b = _b[_start:] k, v = F.linear(key, _w, _b).chunk(2, dim=-1) else: # This is inline in_proj function with in_proj_weight and in_proj_bias _b = in_proj_bias _start = 0 _end = embed_dim _w = in_proj_weight[_start:_end, :] 
if _b is not None: _b = _b[_start:_end] q = F.linear(query, _w, _b) # This is inline in_proj function with in_proj_weight and in_proj_bias _b = in_proj_bias _start = embed_dim _end = embed_dim * 2 _w = in_proj_weight[_start:_end, :] if _b is not None: _b = _b[_start:_end] k = F.linear(key, _w, _b) # This is inline in_proj function with in_proj_weight and in_proj_bias _b = in_proj_bias _start = embed_dim * 2 _end = None _w = in_proj_weight[_start:, :] if _b is not None: _b = _b[_start:] v = F.linear(value, _w, _b) else: q_proj_weight_non_opt = torch.jit._unwrap_optional(q_proj_weight) len1, len2 = q_proj_weight_non_opt.size() assert len1 == embed_dim and len2 == query.size(-1) k_proj_weight_non_opt = torch.jit._unwrap_optional(k_proj_weight) len1, len2 = k_proj_weight_non_opt.size() assert len1 == embed_dim and len2 == key.size(-1) v_proj_weight_non_opt = torch.jit._unwrap_optional(v_proj_weight) len1, len2 = v_proj_weight_non_opt.size() assert len1 == embed_dim and len2 == value.size(-1) if in_proj_bias is not None: q = F.linear(query, q_proj_weight_non_opt, in_proj_bias[0:embed_dim]) k = F.linear(key, k_proj_weight_non_opt, in_proj_bias[embed_dim:(embed_dim * 2)]) v = F.linear(value, v_proj_weight_non_opt, in_proj_bias[(embed_dim * 2):]) else: q = F.linear(query, q_proj_weight_non_opt, in_proj_bias) k = F.linear(key, k_proj_weight_non_opt, in_proj_bias) v = F.linear(value, v_proj_weight_non_opt, in_proj_bias) q = q * scaling if bias_k is not None and bias_v is not None: if static_k is None and static_v is None: k = torch.cat([k, bias_k.repeat(1, bsz, 1)]) v = torch.cat([v, bias_v.repeat(1, bsz, 1)]) if attn_mask is not None: attn_mask = torch.cat([attn_mask, torch.zeros((attn_mask.size(0), 1), dtype=attn_mask.dtype, device=attn_mask.device)], dim=1) if key_padding_mask is not None: key_padding_mask = torch.cat( [key_padding_mask, torch.zeros((key_padding_mask.size(0), 1), dtype=key_padding_mask.dtype, device=key_padding_mask.device)], dim=1) else: assert static_k 
is None, "bias cannot be added to static key." assert static_v is None, "bias cannot be added to static value." else: assert bias_k is None assert bias_v is None q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1) if k is not None: k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) if v is not None: v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) if static_k is not None: assert static_k.size(0) == bsz * num_heads assert static_k.size(2) == head_dim k = static_k if static_v is not None: assert static_v.size(0) == bsz * num_heads assert static_v.size(2) == head_dim v = static_v src_len = k.size(1) if key_padding_mask is not None: assert key_padding_mask.size(0) == bsz assert key_padding_mask.size(1) == src_len if add_zero_attn: src_len += 1 k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1) v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1) if attn_mask is not None: if len(attn_mask.shape) == 2: attn_mask = torch.cat([attn_mask, torch.zeros((attn_mask.size(0), 1), dtype=attn_mask.dtype, device=attn_mask.device)], dim=1) else: attn_mask = torch.cat([attn_mask, torch.zeros((attn_mask.size(0), attn_mask.size(1), 1), dtype=attn_mask.dtype, device=attn_mask.device)], dim=2) if key_padding_mask is not None: key_padding_mask = torch.cat( [key_padding_mask, torch.zeros((key_padding_mask.size(0), 1), dtype=key_padding_mask.dtype, device=key_padding_mask.device)], dim=1) attn_output_weights = torch.bmm(q, k.transpose(1, 2)) assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len] if attn_mask is not None: if len(attn_mask.shape) == 2: attn_mask = attn_mask.unsqueeze(0) else: attn_mask = attn_mask.unsqueeze(1).repeat(1, num_heads, 1, 1) attn_mask = attn_mask.reshape(attn_mask.size(0)*num_heads, attn_mask.size(2), attn_mask.size(3)) attn_output_weights += attn_mask if key_padding_mask is not 
class FFN(nn.Module):
    """Group of small convolutional prediction branches sharing one input.

    One branch is created per entry of ``heads`` and registered as a
    sub-module under the entry's name. Each branch is ``num_conv - 1``
    ``ConvModule`` layers (conv + norm + activation as configured) followed
    by one plain convolution that maps to the requested output channels.

    Args:
        in_channels (int): Channel count of the input feature.
        heads (dict): Maps a branch name to ``(out_channels, num_conv)``.
        head_conv (int): Channel count of the hidden ``ConvModule`` layers.
        final_kernel (int): Kernel size used by every convolution of a
            branch; padding is ``final_kernel // 2``.
        init_bias (float): Value written into the bias of the last layer of
            the ``'heatmap'`` branch by :meth:`init_weights`.
        conv_cfg (dict): Config handed to ``ConvModule`` / ``build_conv_layer``.
        norm_cfg (dict): Norm config handed to ``ConvModule``.
        bias (bool | str): Bias setting of the hidden ``ConvModule`` layers.
        **kwargs: Accepted for config compatibility and ignored.
    """

    def __init__(self,
                 in_channels,
                 heads,
                 head_conv=64,
                 final_kernel=1,
                 init_bias=-2.19,
                 conv_cfg=dict(type='Conv1d'),
                 norm_cfg=dict(type='BN1d'),
                 bias='auto',
                 **kwargs):
        super(FFN, self).__init__()

        self.heads = heads
        self.init_bias = init_bias
        for head in self.heads:
            classes, num_conv = self.heads[head]

            conv_layers = []
            c_in = in_channels
            for _ in range(num_conv - 1):
                conv_layers.append(
                    ConvModule(
                        c_in,
                        head_conv,
                        kernel_size=final_kernel,
                        stride=1,
                        padding=final_kernel // 2,
                        bias=bias,
                        conv_cfg=conv_cfg,
                        norm_cfg=norm_cfg))
                c_in = head_conv

            # Use the tracked channel count: when num_conv == 1 there is no
            # hidden layer, so the final conv consumes `in_channels`, not
            # `head_conv`. For num_conv >= 2 this equals `head_conv`.
            conv_layers.append(
                build_conv_layer(
                    conv_cfg,
                    c_in,
                    classes,
                    kernel_size=final_kernel,
                    stride=1,
                    padding=final_kernel // 2,
                    bias=True))
            setattr(self, head, nn.Sequential(*conv_layers))

    def init_weights(self):
        """Initialize the branches.

        The last layer of the ``'heatmap'`` branch gets its bias filled with
        ``init_bias``; convolutions of every other branch are Kaiming
        initialized.
        """
        for head in self.heads:
            branch = getattr(self, head)
            if head == 'heatmap':
                branch[-1].bias.data.fill_(self.init_bias)
            else:
                for m in branch.modules():
                    # The default conv_cfg builds Conv1d layers, so both conv
                    # flavours have to be matched here.
                    if isinstance(m, (nn.Conv1d, nn.Conv2d)):
                        kaiming_init(m)

    def forward(self, x):
        """Apply every branch to the same input.

        Args:
            x (torch.Tensor): Input feature. Its layout must match the
                configured convolution type; the head in this file passes
                query features shaped ``[B, C, num_proposals]``.

        Returns:
            dict[str, torch.Tensor]: One output per branch, keyed by the
            branch name given in ``heads``; the channel dimension of each
            output equals the ``out_channels`` requested for that branch.
        """
        ret_dict = dict()
        for head in self.heads:
            ret_dict[head] = getattr(self, head)(x)

        return ret_dict
nms_kernel_size if self.initialize_by_heatmap is True: assert self.learnable_query_pos is False, "initialized by heatmap is conflicting with learnable query position" self.train_cfg = train_cfg self.test_cfg = test_cfg self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) if not self.use_sigmoid_cls: self.num_classes += 1 self.loss_cls = build_loss(loss_cls) self.loss_bbox = build_loss(loss_bbox) self.loss_iou = build_loss(loss_iou) self.loss_heatmap = build_loss(loss_heatmap) self.bbox_coder = build_bbox_coder(bbox_coder) self.sampling = False # a shared convolution self.shared_conv = build_conv_layer( dict(type='Conv2d'), in_channels, hidden_channel, kernel_size=3, padding=1, bias=bias, ) if self.initialize_by_heatmap: layers = [] layers.append(ConvModule( hidden_channel, hidden_channel, kernel_size=3, padding=1, bias=bias, conv_cfg=dict(type='Conv2d'), norm_cfg=dict(type='BN2d'), )) layers.append(build_conv_layer( dict(type='Conv2d'), hidden_channel, num_classes, kernel_size=3, padding=1, bias=bias, )) self.heatmap_head = nn.Sequential(*layers) self.class_encoding = nn.Conv1d(num_classes, hidden_channel, 1) else: # query feature self.query_feat = nn.Parameter(torch.randn(1, hidden_channel, self.num_proposals)) self.query_pos = nn.Parameter(torch.rand([1, self.num_proposals, 2]), requires_grad=learnable_query_pos) # transformer decoder layers for object query with LiDAR feature self.decoder = nn.ModuleList() for i in range(self.num_decoder_layers): self.decoder.append( TransformerDecoderLayer( hidden_channel, num_heads, ffn_channel, dropout, activation, self_posembed=PositionEmbeddingLearned(2, hidden_channel), cross_posembed=PositionEmbeddingLearned(2, hidden_channel), )) # Prediction Head self.prediction_heads = nn.ModuleList() for i in range(self.num_decoder_layers): heads = copy.deepcopy(common_heads) heads.update(dict(heatmap=(self.num_classes, num_heatmap_convs))) self.prediction_heads.append(FFN(hidden_channel, heads, conv_cfg=conv_cfg, 
norm_cfg=norm_cfg, bias=bias)) self.fuse_img = fuse_img if self.fuse_img: self.num_views = num_views self.out_size_factor_img = out_size_factor_img self.shared_conv_img = build_conv_layer( dict(type='Conv2d'), in_channels_img, # channel of img feature map hidden_channel, kernel_size=3, padding=1, bias=bias, ) if self.initialize_by_heatmap: self.heatmap_head_img = copy.deepcopy(self.heatmap_head) # transformer decoder layers for img fusion self.decoder.append( TransformerDecoderLayer( hidden_channel, num_heads, ffn_channel, dropout, activation, self_posembed=PositionEmbeddingLearned(2, hidden_channel), cross_posembed=PositionEmbeddingLearned(2, hidden_channel), )) # cross-attention only layers for projecting img feature onto BEV for i in range(num_views): self.decoder.append( TransformerDecoderLayer( hidden_channel, num_heads, ffn_channel, dropout, activation, self_posembed=PositionEmbeddingLearned(2, hidden_channel), cross_posembed=PositionEmbeddingLearned(2, hidden_channel), cross_only=True, )) self.fc = nn.Sequential(*[nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1)]) heads = copy.deepcopy(common_heads) heads.update(dict(heatmap=(self.num_classes, num_heatmap_convs))) self.prediction_heads.append(FFN(hidden_channel * 2, heads, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=bias)) self.init_weights() self._init_assigner_sampler() # Position Embedding for Cross-Attention, which is re-used during training x_size = self.test_cfg['grid_size'][0] // self.test_cfg['out_size_factor'] y_size = self.test_cfg['grid_size'][1] // self.test_cfg['out_size_factor'] self.bev_pos = self.create_2D_grid(x_size, y_size) self.img_feat_pos = None self.img_feat_collapsed_pos = None def create_2D_grid(self, x_size, y_size): meshgrid = [[0, x_size - 1, x_size], [0, y_size - 1, y_size]] batch_y, batch_x = torch.meshgrid(*[torch.linspace(it[0], it[1], it[2]) for it in meshgrid]) batch_x = batch_x + 0.5 batch_y = batch_y + 0.5 coord_base = torch.cat([batch_x[None], batch_y[None]], 
def init_weights(self):
    """Xavier-initialize the decoder weights and set the BN momentum."""
    # initialize transformer: only tensors with more than one dimension
    # (weight matrices / kernels) are touched, biases keep their defaults
    for m in self.decoder.parameters():
        if m.dim() > 1:
            nn.init.xavier_uniform_(m)
    # NOTE(review): the constructor stores the learnable queries as
    # `query_feat` / `query_pos`; no `query` attribute is assigned in the
    # visible code, so this branch looks unreachable - confirm before relying
    # on it.
    if hasattr(self, 'query'):
        nn.init.xavier_normal_(self.query)
    self.init_bn_momentum()


def init_bn_momentum(self):
    """Set ``momentum`` of every BatchNorm1d/2d sub-module to ``bn_momentum``."""
    for m in self.modules():
        if isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d)):
            m.momentum = self.bn_momentum


def _init_assigner_sampler(self):
    """Initialize the target assigner and sampler of the head.

    Does nothing when ``train_cfg`` is ``None``. ``train_cfg.assigner`` may
    be a single config dict or a list of config dicts; any other type leaves
    ``bbox_assigner`` unset.
    """
    if self.train_cfg is None:
        return

    if self.sampling:
        self.bbox_sampler = build_sampler(self.train_cfg.sampler)
    else:
        self.bbox_sampler = PseudoSampler()
    if isinstance(self.train_cfg.assigner, dict):
        self.bbox_assigner = build_assigner(self.train_cfg.assigner)
    elif isinstance(self.train_cfg.assigner, list):
        self.bbox_assigner = [
            build_assigner(res) for res in self.train_cfg.assigner
        ]


def forward_single(self, inputs, img_inputs, img_metas):
    """Run the head on one feature level.

    Args:
        inputs (torch.Tensor): BEV feature map, ``[B, C, H, W]``.
        img_inputs (torch.Tensor | None): Image feature map, read only when
            ``fuse_img`` is set; it is reshaped as
            ``[B * num_views, C_img, H_img, W_img]``.
        img_metas (list[dict]): Per-sample meta information, read only when
            ``fuse_img`` is set. Keys used: ``lidar2img``, ``img_shape``,
            ``input_shape`` and, if present, ``scale_factor``, ``flip``,
            ``img_crop_offset``.

    Returns:
        list[dict]: A single-element list. With ``auxiliary`` enabled the
        dict holds the predictions of all appended layers concatenated along
        the proposal axis; otherwise only the last layer's predictions.
        When ``initialize_by_heatmap`` is set, ``query_heatmap_score`` and
        ``dense_heatmap`` are attached to the first layer's dict.

    Side effects:
        Stores ``self.query_labels`` (heatmap initialization) and
        ``self.on_the_image_mask`` (image fusion), and caches the image
        position grids on first use.
    """
    batch_size = inputs.shape[0]
    lidar_feat = self.shared_conv(inputs)

    #################################
    # image to BEV
    #################################
    lidar_feat_flatten = lidar_feat.view(batch_size, lidar_feat.shape[1], -1)  # [BS, C, H*W]
    bev_pos = self.bev_pos.repeat(batch_size, 1, 1).to(lidar_feat.device)

    if self.fuse_img:
        img_feat = self.shared_conv_img(img_inputs)  # [BS * n_views, C, H, W]

        img_h, img_w, num_channel = img_inputs.shape[-2], img_inputs.shape[-1], img_feat.shape[1]
        raw_img_feat = img_feat.view(batch_size, self.num_views, num_channel, img_h, img_w).permute(0, 2, 3, 1, 4)  # [BS, C, H, n_views, W]
        img_feat = raw_img_feat.reshape(batch_size, num_channel, img_h, img_w * self.num_views)  # [BS, C, H, n_views*W]
        # collapse the image height by max-pooling over it
        img_feat_collapsed = img_feat.max(2).values
        img_feat_collapsed = self.fc(img_feat_collapsed).view(batch_size, num_channel, img_w * self.num_views)

        # positional encoding for image guided query initialization
        if self.img_feat_collapsed_pos is None:
            img_feat_collapsed_pos = self.img_feat_collapsed_pos = self.create_2D_grid(1, img_feat_collapsed.shape[-1]).to(img_feat.device)
        else:
            img_feat_collapsed_pos = self.img_feat_collapsed_pos

        # The constructor appends `num_decoder_layers` LiDAR layers, then one
        # image-fusion layer, then one cross-only layer per view. The
        # cross-only layers therefore start at `num_decoder_layers + 1`
        # (this equals the former hard-coded 2 when num_decoder_layers == 1).
        first_cross_only = self.num_decoder_layers + 1
        bev_feat = lidar_feat_flatten
        for idx_view in range(self.num_views):
            view_slice = slice(img_w * idx_view, img_w * (idx_view + 1))
            bev_feat = self.decoder[first_cross_only + idx_view](
                bev_feat, img_feat_collapsed[..., view_slice], bev_pos, img_feat_collapsed_pos[:, view_slice])

    #################################
    # image guided query initialization
    #################################
    if self.initialize_by_heatmap:
        dense_heatmap = self.heatmap_head(lidar_feat)
        dense_heatmap_img = None
        if self.fuse_img:
            dense_heatmap_img = self.heatmap_head_img(bev_feat.view(lidar_feat.shape))  # [BS, num_classes, H, W]
            heatmap = (dense_heatmap.detach().sigmoid() + dense_heatmap_img.detach().sigmoid()) / 2
        else:
            heatmap = dense_heatmap.detach().sigmoid()
        padding = self.nms_kernel_size // 2
        local_max = torch.zeros_like(heatmap)
        # equals to nms radius = voxel_size * out_size_factor * kernel_size
        local_max_inner = F.max_pool2d(heatmap, kernel_size=self.nms_kernel_size, stride=1, padding=0)
        # Explicit end indices: `padding:-padding` would select an empty
        # slice when padding == 0 (nms_kernel_size == 1).
        map_h, map_w = heatmap.shape[-2], heatmap.shape[-1]
        local_max[:, :, padding:map_h - padding, padding:map_w - padding] = local_max_inner
        # for Pedestrian & Traffic_cone in nuScenes: keep every location
        # (kernel size 1 disables the suppression for these classes)
        if self.test_cfg['dataset'] == 'nuScenes':
            local_max[:, 8, ] = F.max_pool2d(heatmap[:, 8], kernel_size=1, stride=1, padding=0)
            local_max[:, 9, ] = F.max_pool2d(heatmap[:, 9], kernel_size=1, stride=1, padding=0)
        elif self.test_cfg['dataset'] == 'Waymo':  # for Pedestrian & Cyclist in Waymo
            local_max[:, 1, ] = F.max_pool2d(heatmap[:, 1], kernel_size=1, stride=1, padding=0)
            local_max[:, 2, ] = F.max_pool2d(heatmap[:, 2], kernel_size=1, stride=1, padding=0)
        # zero out everything that is not a local maximum
        heatmap = heatmap * (heatmap == local_max)
        heatmap = heatmap.view(batch_size, heatmap.shape[1], -1)

        # top #num_proposals among all classes
        top_proposals = heatmap.view(batch_size, -1).argsort(dim=-1, descending=True)[..., :self.num_proposals]
        top_proposals_class = top_proposals // heatmap.shape[-1]
        top_proposals_index = top_proposals % heatmap.shape[-1]
        query_feat = lidar_feat_flatten.gather(index=top_proposals_index[:, None, :].expand(-1, lidar_feat_flatten.shape[1], -1), dim=-1)
        self.query_labels = top_proposals_class

        # add category embedding
        one_hot = F.one_hot(top_proposals_class, num_classes=self.num_classes).permute(0, 2, 1)
        query_cat_encoding = self.class_encoding(one_hot.float())
        query_feat += query_cat_encoding

        query_pos = bev_pos.gather(index=top_proposals_index[:, None, :].permute(0, 2, 1).expand(-1, -1, bev_pos.shape[-1]), dim=1)
    else:
        query_feat = self.query_feat.repeat(batch_size, 1, 1)  # [BS, C, num_proposals]
        query_pos = self.query_pos.repeat(batch_size, 1, 1).to(lidar_feat.device)  # [BS, num_proposals, 2]

    #################################
    # transformer decoder layer (LiDAR feature as K,V)
    #################################
    ret_dicts = []
    for i in range(self.num_decoder_layers):
        # Transformer Decoder Layer
        # :param query: B C Pq    :param query_pos: B Pq 3/6
        query_feat = self.decoder[i](query_feat, lidar_feat_flatten, query_pos, bev_pos)

        # Prediction
        res_layer = self.prediction_heads[i](query_feat)
        # predicted centers are offsets relative to the query positions
        res_layer['center'] = res_layer['center'] + query_pos.permute(0, 2, 1)
        first_res_layer = res_layer
        if not self.fuse_img:
            ret_dicts.append(res_layer)

        # for next level positional embedding
        query_pos = res_layer['center'].detach().clone().permute(0, 2, 1)

    #################################
    # transformer decoder layer (img feature as K,V)
    #################################
    if self.fuse_img:
        # positional encoding for image fusion
        img_feat = raw_img_feat.permute(0, 3, 1, 2, 4)  # [BS, n_views, C, H, W]
        img_feat_flatten = img_feat.view(batch_size, self.num_views, num_channel, -1)  # [BS, n_views, C, H*W]
        if self.img_feat_pos is None:
            (h, w) = img_inputs.shape[-2], img_inputs.shape[-1]
            img_feat_pos = self.img_feat_pos = self.create_2D_grid(h, w).to(img_feat_flatten.device)
        else:
            img_feat_pos = self.img_feat_pos

        prev_query_feat = query_feat.detach().clone()
        query_feat = torch.zeros_like(query_feat)  # create new container for img query feature
        query_pos_realmetric = query_pos.permute(0, 2, 1) * self.test_cfg['out_size_factor'] * self.test_cfg['voxel_size'][0] + self.test_cfg['pc_range'][0]
        query_pos_3d = torch.cat([query_pos_realmetric, res_layer['height']], dim=1).detach().clone()
        if 'vel' in res_layer:
            vel = copy.deepcopy(res_layer['vel'].detach())
        else:
            vel = None
        pred_boxes = self.bbox_coder.decode(
            copy.deepcopy(res_layer['heatmap'].detach()),
            copy.deepcopy(res_layer['rot'].detach()),
            copy.deepcopy(res_layer['dim'].detach()),
            copy.deepcopy(res_layer['center'].detach()),
            copy.deepcopy(res_layer['height'].detach()),
            vel,
        )

        # -1 marks "query is not visible in any view"
        on_the_image_mask = torch.ones([batch_size, self.num_proposals]).to(query_pos_3d.device) * -1

        for sample_idx in range(batch_size):
            lidar2img_rt = query_pos_3d.new_tensor(img_metas[sample_idx]['lidar2img'])
            img_scale_factor = (
                query_pos_3d.new_tensor(img_metas[sample_idx]['scale_factor'][:2]
                                        if 'scale_factor' in img_metas[sample_idx].keys() else [1.0, 1.0])
            )
            img_flip = img_metas[sample_idx]['flip'] if 'flip' in img_metas[sample_idx].keys() else False
            img_crop_offset = (
                query_pos_3d.new_tensor(img_metas[sample_idx]['img_crop_offset'])
                if 'img_crop_offset' in img_metas[sample_idx].keys() else 0)
            img_shape = img_metas[sample_idx]['img_shape'][:2]
            img_pad_shape = img_metas[sample_idx]['input_shape'][:2]
            boxes = LiDARInstance3DBoxes(pred_boxes[sample_idx]['bboxes'][:, :7], box_dim=7)
            query_pos_3d_with_corners = torch.cat([query_pos_3d[sample_idx], boxes.corners.permute(2, 0, 1).view(3, -1)], dim=-1)  # [3, num_proposals] + [3, num_proposals*8]
            # transform point clouds back to original coordinate system by reverting the data augmentation
            if batch_size == 1:  # skip during inference to save time
                points = query_pos_3d_with_corners.T
            else:
                points = apply_3d_transformation(query_pos_3d_with_corners.T, 'LIDAR', img_metas[sample_idx], reverse=True).detach()
            num_points = points.shape[0]

            for view_idx in range(self.num_views):
                pts_4d = torch.cat([points, points.new_ones(size=(num_points, 1))], dim=-1)
                pts_2d = pts_4d @ lidar2img_rt[view_idx].t()

                # clamp the depth so the perspective division stays finite
                pts_2d[:, 2] = torch.clamp(pts_2d[:, 2], min=1e-5)
                pts_2d[:, 0] /= pts_2d[:, 2]
                pts_2d[:, 1] /= pts_2d[:, 2]

                # img transformation: scale -> crop -> flip
                # the image is resized by img_scale_factor
                img_coors = pts_2d[:, 0:2] * img_scale_factor  # Nx2
                img_coors -= img_crop_offset

                # grid sample, the valid grid range should be in [-1,1]
                coor_x, coor_y = torch.split(img_coors, 1, dim=1)  # each is Nx1

                if img_flip:
                    # by default we take it as horizontal flip
                    # use img_shape before padding for flip
                    orig_h, orig_w = img_shape
                    coor_x = orig_w - coor_x

                # first num_proposals rows are the centers, the rest the
                # 8 box corners of every proposal
                coor_x, coor_corner_x = coor_x[0:self.num_proposals, :], coor_x[self.num_proposals:, :]
                coor_y, coor_corner_y = coor_y[0:self.num_proposals, :], coor_y[self.num_proposals:, :]
                coor_corner_x = coor_corner_x.reshape(self.num_proposals, 8, 1)
                coor_corner_y = coor_corner_y.reshape(self.num_proposals, 8, 1)
                coor_corner_xy = torch.cat([coor_corner_x, coor_corner_y], dim=-1)

                h, w = img_pad_shape
                on_the_image = (coor_x > 0) * (coor_x < w) * (coor_y > 0) * (coor_y < h)
                on_the_image = on_the_image.squeeze()
                # skip the following computation if at most one object query
                # falls on the current image
                if on_the_image.sum() <= 1:
                    continue
                on_the_image_mask[sample_idx, on_the_image] = view_idx

                # add spatial constraint
                center_ys = (coor_y[on_the_image] / self.out_size_factor_img)
                center_xs = (coor_x[on_the_image] / self.out_size_factor_img)
                centers = torch.cat([center_xs, center_ys], dim=-1).int()  # center on the feature map
                corners = (coor_corner_xy[on_the_image].max(1).values - coor_corner_xy[on_the_image].min(1).values) / self.out_size_factor_img
                radius = torch.ceil(corners.norm(dim=-1, p=2) / 2).int()  # radius of the minimum circumscribed circle of the wireframe
                sigma = (radius * 2 + 1) / 6.0
                distance = (centers[:, None, :] - (img_feat_pos - 0.5)).norm(dim=-1) ** 2
                gaussian_mask = (-distance / (2 * sigma[:, None] ** 2)).exp()
                gaussian_mask[gaussian_mask < torch.finfo(torch.float32).eps] = 0
                attn_mask = gaussian_mask

                query_feat_view = prev_query_feat[sample_idx, :, on_the_image]
                query_pos_view = torch.cat([center_xs, center_ys], dim=-1)
                # the log of the gaussian mask is passed as attention mask
                # (zeroed entries become -inf)
                query_feat_view = self.decoder[self.num_decoder_layers](
                    query_feat_view[None], img_feat_flatten[sample_idx:sample_idx + 1, view_idx],
                    query_pos_view[None], img_feat_pos, attn_mask=attn_mask.log())
                query_feat[sample_idx, :, on_the_image] = query_feat_view.clone()

        self.on_the_image_mask = (on_the_image_mask != -1)
        res_layer = self.prediction_heads[self.num_decoder_layers](torch.cat([query_feat, prev_query_feat], dim=1))
        res_layer['center'] = res_layer['center'] + query_pos.permute(0, 2, 1)
        # queries that are not visible in any image keep the LiDAR-only
        # prediction
        for key, value in res_layer.items():
            pred_dim = value.shape[1]
            off_image = ~self.on_the_image_mask.unsqueeze(1).repeat(1, pred_dim, 1)
            res_layer[key][off_image] = first_res_layer[key][off_image]
        ret_dicts.append(res_layer)

    if self.initialize_by_heatmap:
        ret_dicts[0]['query_heatmap_score'] = heatmap.gather(index=top_proposals_index[:, None, :].expand(-1, self.num_classes, -1), dim=-1)  # [bs, num_classes, num_proposals]
        if self.fuse_img:
            ret_dicts[0]['dense_heatmap'] = dense_heatmap_img
        else:
            ret_dicts[0]['dense_heatmap'] = dense_heatmap

    if self.auxiliary is False:
        # only return the results of last decoder layer
        return [ret_dicts[-1]]

    # return all the layer's results for auxiliary supervision
    new_res = {}
    for key in ret_dicts[0].keys():
        if key not in ['dense_heatmap', 'dense_heatmap_old', 'query_heatmap_score']:
            new_res[key] = torch.cat([ret_dict[key] for ret_dict in ret_dicts], dim=-1)
        else:
            new_res[key] = ret_dicts[0][key]
    return [new_res]


def forward(self, feats, img_feats, img_metas):
    """Forward pass.

    Args:
        feats (list[torch.Tensor]): Multi-level features, e.g.,
            features produced by FPN. Exactly one level is supported.
        img_feats (list[torch.Tensor] | None): Image features per level;
            ``None`` is replaced by ``[None]``.
        img_metas (list[dict]): Meta information passed through to
            :meth:`forward_single`.

    Returns:
        tuple(list[dict]): Output results. first index by level, second index by layer
    """
    if img_feats is None:
        img_feats = [None]
    res = multi_apply(self.forward_single, feats, img_feats, [img_metas])
    assert len(res) == 1, "only support one level features."
    return res
[BS, num_proposals, 8] """ # change preds_dict into list of dict (index by batch_id) # preds_dict[0]['center'].shape [bs, 3, num_proposal] list_of_pred_dict = [] for batch_idx in range(len(gt_bboxes_3d)): pred_dict = {} for key in preds_dict[0].keys(): pred_dict[key] = preds_dict[0][key][batch_idx:batch_idx + 1] list_of_pred_dict.append(pred_dict) assert len(gt_bboxes_3d) == len(list_of_pred_dict) res_tuple = multi_apply(self.get_targets_single, gt_bboxes_3d, gt_labels_3d, list_of_pred_dict, np.arange(len(gt_labels_3d))) labels = torch.cat(res_tuple[0], dim=0) label_weights = torch.cat(res_tuple[1], dim=0) bbox_targets = torch.cat(res_tuple[2], dim=0) bbox_weights = torch.cat(res_tuple[3], dim=0) ious = torch.cat(res_tuple[4], dim=0) num_pos = np.sum(res_tuple[5]) matched_ious = np.mean(res_tuple[6]) if self.initialize_by_heatmap: heatmap = torch.cat(res_tuple[7], dim=0) return labels, label_weights, bbox_targets, bbox_weights, ious, num_pos, matched_ious, heatmap else: return labels, label_weights, bbox_targets, bbox_weights, ious, num_pos, matched_ious def get_targets_single(self, gt_bboxes_3d, gt_labels_3d, preds_dict, batch_idx): """Generate training targets for a single sample. Args: gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes. gt_labels_3d (torch.Tensor): Labels of boxes. preds_dict (dict): dict of prediction result for a single sample Returns: tuple[torch.Tensor]: Tuple of target including \ the following results in order. - torch.Tensor: classification target. [1, num_proposals] - torch.Tensor: classification weights (mask) [1, num_proposals] - torch.Tensor: regression target. [1, num_proposals, 8] - torch.Tensor: regression weights. [1, num_proposals, 8] - torch.Tensor: iou target. [1, num_proposals] - int: number of positive proposals """ num_proposals = preds_dict['center'].shape[-1] # get pred boxes, carefully ! 
donot change the network outputs score = copy.deepcopy(preds_dict['heatmap'].detach()) center = copy.deepcopy(preds_dict['center'].detach()) height = copy.deepcopy(preds_dict['height'].detach()) dim = copy.deepcopy(preds_dict['dim'].detach()) rot = copy.deepcopy(preds_dict['rot'].detach()) if 'vel' in preds_dict.keys(): vel = copy.deepcopy(preds_dict['vel'].detach()) else: vel = None boxes_dict = self.bbox_coder.decode(score, rot, dim, center, height, vel) # decode the prediction to real world metric bbox bboxes_tensor = boxes_dict[0]['bboxes'] gt_bboxes_tensor = gt_bboxes_3d.tensor.to(score.device) # each layer should do label assign seperately. if self.auxiliary: num_layer = self.num_decoder_layers else: num_layer = 1 assign_result_list = [] for idx_layer in range(num_layer): bboxes_tensor_layer = bboxes_tensor[self.num_proposals * idx_layer:self.num_proposals * (idx_layer + 1), :] score_layer = score[..., self.num_proposals * idx_layer:self.num_proposals * (idx_layer + 1)] if self.train_cfg.assigner.type == 'HungarianAssigner3D': assign_result = self.bbox_assigner.assign(bboxes_tensor_layer, gt_bboxes_tensor, gt_labels_3d, score_layer, self.train_cfg) elif self.train_cfg.assigner.type == 'HeuristicAssigner': assign_result = self.bbox_assigner.assign(bboxes_tensor_layer, gt_bboxes_tensor, None, gt_labels_3d, self.query_labels[batch_idx]) else: raise NotImplementedError assign_result_list.append(assign_result) # combine assign result of each layer assign_result_ensemble = AssignResult( num_gts=sum([res.num_gts for res in assign_result_list]), gt_inds=torch.cat([res.gt_inds for res in assign_result_list]), max_overlaps=torch.cat([res.max_overlaps for res in assign_result_list]), labels=torch.cat([res.labels for res in assign_result_list]), ) sampling_result = self.bbox_sampler.sample(assign_result_ensemble, bboxes_tensor, gt_bboxes_tensor) pos_inds = sampling_result.pos_inds neg_inds = sampling_result.neg_inds assert len(pos_inds) + len(neg_inds) == num_proposals # 
create target for loss computation bbox_targets = torch.zeros([num_proposals, self.bbox_coder.code_size]).to(center.device) bbox_weights = torch.zeros([num_proposals, self.bbox_coder.code_size]).to(center.device) ious = assign_result_ensemble.max_overlaps ious = torch.clamp(ious, min=0.0, max=1.0) labels = bboxes_tensor.new_zeros(num_proposals, dtype=torch.long) label_weights = bboxes_tensor.new_zeros(num_proposals, dtype=torch.long) if gt_labels_3d is not None: # default label is -1 labels += self.num_classes # both pos and neg have classification loss, only pos has regression and iou loss if len(pos_inds) > 0: pos_bbox_targets = self.bbox_coder.encode(sampling_result.pos_gt_bboxes) bbox_targets[pos_inds, :] = pos_bbox_targets bbox_weights[pos_inds, :] = 1.0 if gt_labels_3d is None: labels[pos_inds] = 1 else: labels[pos_inds] = gt_labels_3d[sampling_result.pos_assigned_gt_inds] if self.train_cfg.pos_weight <= 0: label_weights[pos_inds] = 1.0 else: label_weights[pos_inds] = self.train_cfg.pos_weight if len(neg_inds) > 0: label_weights[neg_inds] = 1.0 # # compute dense heatmap targets if self.initialize_by_heatmap: device = labels.device gt_bboxes_3d = torch.cat([gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]], dim=1).to(device) grid_size = torch.tensor(self.train_cfg['grid_size']) pc_range = torch.tensor(self.train_cfg['point_cloud_range']) voxel_size = torch.tensor(self.train_cfg['voxel_size']) feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor'] # [x_len, y_len] heatmap = gt_bboxes_3d.new_zeros(self.num_classes, feature_map_size[1], feature_map_size[0]) for idx in range(len(gt_bboxes_3d)): width = gt_bboxes_3d[idx][3] length = gt_bboxes_3d[idx][4] width = width / voxel_size[0] / self.train_cfg['out_size_factor'] length = length / voxel_size[1] / self.train_cfg['out_size_factor'] if width > 0 and length > 0: radius = gaussian_radius((length, width), min_overlap=self.train_cfg['gaussian_overlap']) radius = max(self.train_cfg['min_radius'], 
@force_fp32(apply_to=('preds_dicts',))
def loss(self, gt_bboxes_3d, gt_labels_3d, preds_dicts, **kwargs):
    """Compute the training losses of the head.

    Args:
        gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground truth gt boxes.
        gt_labels_3d (list[torch.Tensor]): Labels of boxes.
        preds_dicts (list[list[dict]]): Output of forward function; only
            ``preds_dicts[0][0]`` is read.

    Returns:
        dict[str:torch.Tensor]: ``loss_heatmap`` (only when the queries are
        initialized by heatmap), ``layer_{i}_loss_cls`` and
        ``layer_{i}_loss_bbox`` per decoder layer (the last layer uses the
        prefix ``layer_-1``), and ``matched_ious``, a monitoring value that
        carries no gradient.
    """
    if self.initialize_by_heatmap:
        labels, label_weights, bbox_targets, bbox_weights, ious, num_pos, matched_ious, heatmap = \
            self.get_targets(gt_bboxes_3d, gt_labels_3d, preds_dicts[0])
    else:
        labels, label_weights, bbox_targets, bbox_weights, ious, num_pos, matched_ious = \
            self.get_targets(gt_bboxes_3d, gt_labels_3d, preds_dicts[0])
    if hasattr(self, 'on_the_image_mask'):
        # image fusion: only queries visible in some view contribute
        label_weights = label_weights * self.on_the_image_mask
        bbox_weights = bbox_weights * self.on_the_image_mask[:, :, None]
        num_pos = bbox_weights.max(-1).values.sum()
    preds_dict = preds_dicts[0][0]
    loss_dict = dict()

    if self.initialize_by_heatmap:
        # compute heatmap loss, normalized by the number of peak (== 1) cells
        loss_heatmap = self.loss_heatmap(
            clip_sigmoid(preds_dict['dense_heatmap']), heatmap,
            avg_factor=max(heatmap.eq(1).float().sum().item(), 1))
        loss_dict['loss_heatmap'] = loss_heatmap

    # Optional per-dimension regression weights. When the config does not
    # provide them the bbox weights are used unscaled instead of failing on
    # `new_tensor(None)`.
    code_weights = self.train_cfg.get('code_weights', None)
    has_vel = 'vel' in preds_dict.keys()

    # compute loss for each layer
    for idx_layer in range(self.num_decoder_layers if self.auxiliary else 1):
        if idx_layer == self.num_decoder_layers - 1 or (idx_layer == 0 and self.auxiliary is False):
            prefix = 'layer_-1'
        else:
            prefix = f'layer_{idx_layer}'

        # proposals of all layers are concatenated along the last axis
        cols = slice(idx_layer * self.num_proposals, (idx_layer + 1) * self.num_proposals)

        layer_labels = labels[..., cols].reshape(-1)
        layer_label_weights = label_weights[..., cols].reshape(-1)
        layer_score = preds_dict['heatmap'][..., cols]
        layer_cls_score = layer_score.permute(0, 2, 1).reshape(-1, self.num_classes)
        layer_loss_cls = self.loss_cls(layer_cls_score, layer_labels, layer_label_weights, avg_factor=max(num_pos, 1))

        box_parts = [
            preds_dict['center'][..., cols],
            preds_dict['height'][..., cols],
            preds_dict['dim'][..., cols],
            preds_dict['rot'][..., cols],
        ]
        if has_vel:
            box_parts.append(preds_dict['vel'][..., cols])
        preds = torch.cat(box_parts, dim=1).permute(0, 2, 1)  # [BS, num_proposals, code_size]

        layer_bbox_weights = bbox_weights[:, cols, :]
        if code_weights is None:
            layer_reg_weights = layer_bbox_weights
        else:
            layer_reg_weights = layer_bbox_weights * layer_bbox_weights.new_tensor(code_weights)
        layer_bbox_targets = bbox_targets[:, cols, :]
        layer_loss_bbox = self.loss_bbox(preds, layer_bbox_targets, layer_reg_weights, avg_factor=max(num_pos, 1))

        # The IoU branch is disabled:
        # layer_iou = preds_dict['iou'][..., idx_layer*self.num_proposals:(idx_layer+1)*self.num_proposals].squeeze(1)
        # layer_iou_target = ious[..., idx_layer*self.num_proposals:(idx_layer+1)*self.num_proposals]
        # layer_loss_iou = self.loss_iou(layer_iou, layer_iou_target, layer_bbox_weights.max(-1).values, avg_factor=max(num_pos, 1))

        loss_dict[f'{prefix}_loss_cls'] = layer_loss_cls
        loss_dict[f'{prefix}_loss_bbox'] = layer_loss_bbox
        # loss_dict[f'{prefix}_loss_iou'] = layer_loss_iou

    loss_dict['matched_ious'] = layer_loss_cls.new_tensor(matched_ious)

    return loss_dict
Returns: list[list[dict]]: Decoded bbox, scores and labels for each layer & each batch """ rets = [] for layer_id, preds_dict in enumerate(preds_dicts): batch_size = preds_dict[0]['heatmap'].shape[0] batch_score = preds_dict[0]['heatmap'][..., -self.num_proposals:].sigmoid() # if self.loss_iou.loss_weight != 0: # batch_score = torch.sqrt(batch_score * preds_dict[0]['iou'][..., -self.num_proposals:].sigmoid()) one_hot = F.one_hot(self.query_labels, num_classes=self.num_classes).permute(0, 2, 1) batch_score = batch_score * preds_dict[0]['query_heatmap_score'] * one_hot batch_center = preds_dict[0]['center'][..., -self.num_proposals:] batch_height = preds_dict[0]['height'][..., -self.num_proposals:] batch_dim = preds_dict[0]['dim'][..., -self.num_proposals:] batch_rot = preds_dict[0]['rot'][..., -self.num_proposals:] batch_vel = None if 'vel' in preds_dict[0]: batch_vel = preds_dict[0]['vel'][..., -self.num_proposals:] temp = self.bbox_coder.decode(batch_score, batch_rot, batch_dim, batch_center, batch_height, batch_vel, filter=True) if self.test_cfg['dataset'] == 'nuScenes': self.tasks = [ dict(num_class=8, class_names=[], indices=[0, 1, 2, 3, 4, 5, 6, 7], radius=-1), dict(num_class=1, class_names=['pedestrian'], indices=[8], radius=0.175), dict(num_class=1, class_names=['traffic_cone'], indices=[9], radius=0.175), ] elif self.test_cfg['dataset'] == 'Waymo': self.tasks = [ dict(num_class=1, class_names=['Car'], indices=[0], radius=0.7), dict(num_class=1, class_names=['Pedestrian'], indices=[1], radius=0.7), dict(num_class=1, class_names=['Cyclist'], indices=[2], radius=0.7), ] ret_layer = [] for i in range(batch_size): boxes3d = temp[i]['bboxes'] scores = temp[i]['scores'] labels = temp[i]['labels'] ## adopt circle nms for different categories if self.test_cfg['nms_type'] != None: keep_mask = torch.zeros_like(scores) for task in self.tasks: task_mask = torch.zeros_like(scores) for cls_idx in task['indices']: task_mask += labels == cls_idx task_mask = task_mask.bool() 
if task['radius'] > 0: if self.test_cfg['nms_type'] == 'circle': boxes_for_nms = torch.cat([boxes3d[task_mask][:, :2], scores[:, None][task_mask]], dim=1) task_keep_indices = torch.tensor( circle_nms( boxes_for_nms.detach().cpu().numpy(), task['radius'], ) ) else: boxes_for_nms = xywhr2xyxyr(img_metas[i]['box_type_3d'](boxes3d[task_mask][:, :7], 7).bev) top_scores = scores[task_mask] task_keep_indices = nms_gpu( boxes_for_nms, top_scores, thresh=task['radius'], pre_maxsize=self.test_cfg['pre_maxsize'], post_max_size=self.test_cfg['post_maxsize'], ) else: task_keep_indices = torch.arange(task_mask.sum()) if task_keep_indices.shape[0] != 0: keep_indices = torch.where(task_mask != 0)[0][task_keep_indices] keep_mask[keep_indices] = 1 keep_mask = keep_mask.bool() ret = dict(bboxes=boxes3d[keep_mask], scores=scores[keep_mask], labels=labels[keep_mask]) else: # no nms ret = dict(bboxes=boxes3d, scores=scores, labels=labels) ret_layer.append(ret) rets.append(ret_layer) assert len(rets) == 1 assert len(rets[0]) == 1 res = [[ img_metas[0]['box_type_3d'](rets[0][0]['bboxes'], box_dim=rets[0][0]['bboxes'].shape[-1]), rets[0][0]['scores'], rets[0][0]['labels'].int() ]] return res ================================================ FILE: mmdet3d/models/dense_heads/vote_head.py ================================================ import numpy as np import torch from mmcv.runner import force_fp32 from torch import nn as nn from torch.nn import functional as F from mmdet3d.core.post_processing import aligned_3d_nms from mmdet3d.models.builder import build_loss from mmdet3d.models.losses import chamfer_distance from mmdet3d.models.model_utils import VoteModule from mmdet3d.ops import build_sa_module, furthest_point_sample from mmdet.core import build_bbox_coder, multi_apply from mmdet.models import HEADS from .base_conv_bbox_head import BaseConvBboxHead @HEADS.register_module() class VoteHead(nn.Module): r"""Bbox head of `Votenet `_. Args: num_classes (int): The number of class. 
bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and decoding boxes. train_cfg (dict): Config for training. test_cfg (dict): Config for testing. vote_module_cfg (dict): Config of VoteModule for point-wise votes. vote_aggregation_cfg (dict): Config of vote aggregation layer. pred_layer_cfg (dict): Config of classfication and regression prediction layers. conv_cfg (dict): Config of convolution in prediction layer. norm_cfg (dict): Config of BN in prediction layer. objectness_loss (dict): Config of objectness loss. center_loss (dict): Config of center loss. dir_class_loss (dict): Config of direction classification loss. dir_res_loss (dict): Config of direction residual regression loss. size_class_loss (dict): Config of size classification loss. size_res_loss (dict): Config of size residual regression loss. semantic_loss (dict): Config of point-wise semantic segmentation loss. """ def __init__(self, num_classes, bbox_coder, train_cfg=None, test_cfg=None, vote_module_cfg=None, vote_aggregation_cfg=None, pred_layer_cfg=None, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=None, center_loss=None, dir_class_loss=None, dir_res_loss=None, size_class_loss=None, size_res_loss=None, semantic_loss=None, iou_loss=None): super(VoteHead, self).__init__() self.num_classes = num_classes self.train_cfg = train_cfg self.test_cfg = test_cfg self.gt_per_seed = vote_module_cfg['gt_per_seed'] self.num_proposal = vote_aggregation_cfg['num_point'] self.objectness_loss = build_loss(objectness_loss) self.center_loss = build_loss(center_loss) self.dir_res_loss = build_loss(dir_res_loss) self.dir_class_loss = build_loss(dir_class_loss) self.size_res_loss = build_loss(size_res_loss) if size_class_loss is not None: self.size_class_loss = build_loss(size_class_loss) if semantic_loss is not None: self.semantic_loss = build_loss(semantic_loss) if iou_loss is not None: self.iou_loss = build_loss(iou_loss) else: self.iou_loss = None self.bbox_coder = 
def forward(self, feat_dict, sample_mod):
    """Forward pass.

    Note:
        The forward of VoteHead is divided into 4 steps:

            1. Generate vote_points from seed_points.
            2. Aggregate vote_points.
            3. Predict bbox and score.
            4. Decode predictions.

    Args:
        feat_dict (dict): Feature dict from backbone.
        sample_mod (str): Sample mode for vote aggregation layer.
            valid modes are "vote", "seed", "random" and "spec".

    Returns:
        dict: Predictions of vote head. Contains the seed/vote tensors,
        the aggregated points/features/indices and every entry returned
        by ``self.bbox_coder.split_pred``.
    """
    assert sample_mod in ['vote', 'seed', 'random', 'spec']

    seed_points, seed_features, seed_indices = self._extract_input(
        feat_dict)

    # 1. generate vote_points from seed_points
    vote_points, vote_features, vote_offset = self.vote_module(
        seed_points, seed_features)
    results = dict(
        seed_points=seed_points,
        seed_indices=seed_indices,
        vote_points=vote_points,
        vote_features=vote_features,
        vote_offset=vote_offset)

    # 2. aggregate vote_points
    if sample_mod == 'vote':
        # use fps in vote_aggregation
        aggregation_inputs = dict(
            points_xyz=vote_points, features=vote_features)
    elif sample_mod == 'seed':
        # FPS on seed and choose the votes corresponding to the seeds
        sample_indices = furthest_point_sample(seed_points,
                                               self.num_proposal)
        aggregation_inputs = dict(
            points_xyz=vote_points,
            features=vote_features,
            indices=sample_indices)
    elif sample_mod == 'random':
        # Random sampling from the votes. The indices are drawn exactly as
        # before and then moved/cast with ``.to``; wrapping an existing
        # tensor in ``new_tensor`` copy-constructs it and emits a
        # UserWarning.
        batch_size, num_seed = seed_points.shape[:2]
        sample_indices = torch.randint(
            0, num_seed, (batch_size, self.num_proposal)).to(
                device=seed_points.device, dtype=torch.int32)
        aggregation_inputs = dict(
            points_xyz=vote_points,
            features=vote_features,
            indices=sample_indices)
    elif sample_mod == 'spec':
        # Specify the new center in vote_aggregation
        aggregation_inputs = dict(
            points_xyz=seed_points,
            features=seed_features,
            target_xyz=vote_points)
    else:
        # Only reachable when assertions are disabled.
        raise NotImplementedError(
            f'Sample mode {sample_mod} is not supported!')

    vote_aggregation_ret = self.vote_aggregation(**aggregation_inputs)
    aggregated_points, features, aggregated_indices = vote_aggregation_ret

    results['aggregated_points'] = aggregated_points
    results['aggregated_features'] = features
    results['aggregated_indices'] = aggregated_indices

    # 3. predict bbox and score
    cls_predictions, reg_predictions = self.conv_pred(features)

    # 4. decode predictions
    decode_res = self.bbox_coder.split_pred(cls_predictions,
                                            reg_predictions,
                                            aggregated_points)

    results.update(decode_res)

    return results
bbox_preds['dir_class'].transpose(2, 1), dir_class_targets, weight=box_loss_weights) # calculate direction residual loss batch_size, proposal_num = size_class_targets.shape[:2] heading_label_one_hot = vote_targets.new_zeros( (batch_size, proposal_num, self.num_dir_bins)) heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1) dir_res_norm = torch.sum( bbox_preds['dir_res_norm'] * heading_label_one_hot, -1) dir_res_loss = self.dir_res_loss( dir_res_norm, dir_res_targets, weight=box_loss_weights) # calculate size class loss size_class_loss = self.size_class_loss( bbox_preds['size_class'].transpose(2, 1), size_class_targets, weight=box_loss_weights) # calculate size residual loss one_hot_size_targets = vote_targets.new_zeros( (batch_size, proposal_num, self.num_sizes)) one_hot_size_targets.scatter_(2, size_class_targets.unsqueeze(-1), 1) one_hot_size_targets_expand = one_hot_size_targets.unsqueeze( -1).repeat(1, 1, 1, 3).contiguous() size_residual_norm = torch.sum( bbox_preds['size_res_norm'] * one_hot_size_targets_expand, 2) box_loss_weights_expand = box_loss_weights.unsqueeze(-1).repeat( 1, 1, 3) size_res_loss = self.size_res_loss( size_residual_norm, size_res_targets, weight=box_loss_weights_expand) # calculate semantic loss semantic_loss = self.semantic_loss( bbox_preds['sem_scores'].transpose(2, 1), mask_targets, weight=box_loss_weights) losses = dict( vote_loss=vote_loss, objectness_loss=objectness_loss, semantic_loss=semantic_loss, center_loss=center_loss, dir_class_loss=dir_class_loss, dir_res_loss=dir_res_loss, size_class_loss=size_class_loss, size_res_loss=size_res_loss) if self.iou_loss: corners_pred = self.bbox_coder.decode_corners( bbox_preds['center'], size_residual_norm, one_hot_size_targets_expand) corners_target = self.bbox_coder.decode_corners( assigned_center_targets, size_res_targets, one_hot_size_targets_expand) iou_loss = self.iou_loss( corners_pred, corners_target, weight=box_loss_weights) losses['iou_loss'] = iou_loss if 
def get_targets(self,
                points,
                gt_bboxes_3d,
                gt_labels_3d,
                pts_semantic_mask=None,
                pts_instance_mask=None,
                bbox_preds=None):
    """Build the batched training targets of the vote head.

    Per-sample targets are produced by ``get_targets_single`` and then
    stacked along a new batch dimension. Samples without any ground truth
    receive a single all-zero placeholder box (written back into the
    ``gt_bboxes_3d`` / ``gt_labels_3d`` lists) whose validity mask is zero.

    Args:
        points (list[torch.Tensor]): Points of each batch.
        gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
            bboxes of each batch.
        gt_labels_3d (list[torch.Tensor]): Labels of each batch.
        pts_semantic_mask (None | list[torch.Tensor]): Point-wise semantic
            label of each batch.
        pts_instance_mask (None | list[torch.Tensor]): Point-wise instance
            label of each batch.
        bbox_preds (dict): Predictions of vote head; only
            ``bbox_preds['aggregated_points']`` is read here.

    Returns:
        tuple[torch.Tensor]: Targets of vote head.
    """
    num_samples = len(gt_labels_3d)

    # find empty example
    valid_gt_masks = []
    gt_num = []
    for sample_idx in range(num_samples):
        sample_labels = gt_labels_3d[sample_idx]
        if len(sample_labels) != 0:
            valid_gt_masks.append(sample_labels.new_ones(sample_labels.shape))
            gt_num.append(sample_labels.shape[0])
            continue
        # Replace the empty ground truth by one zero box flagged invalid.
        sample_boxes = gt_bboxes_3d[sample_idx]
        placeholder = sample_boxes.tensor.new_zeros(
            1, sample_boxes.tensor.shape[-1])
        gt_bboxes_3d[sample_idx] = sample_boxes.new_box(placeholder)
        gt_labels_3d[sample_idx] = sample_labels.new_zeros(1)
        valid_gt_masks.append(gt_labels_3d[sample_idx].new_zeros(1))
        gt_num.append(1)
    max_gt_num = max(gt_num)

    if pts_semantic_mask is None:
        pts_semantic_mask = [None for _ in range(num_samples)]
        pts_instance_mask = [None for _ in range(num_samples)]

    aggregated_points = [
        bbox_preds['aggregated_points'][sample_idx]
        for sample_idx in range(num_samples)
    ]

    per_sample_targets = multi_apply(self.get_targets_single, points,
                                     gt_bboxes_3d, gt_labels_3d,
                                     pts_semantic_mask, pts_instance_mask,
                                     aggregated_points)
    (vote_targets, vote_target_masks, size_class_targets, size_res_targets,
     dir_class_targets, dir_res_targets, center_targets,
     assigned_center_targets, mask_targets, objectness_targets,
     objectness_masks) = per_sample_targets

    # pad targets as original code of votenet.
    for sample_idx in range(num_samples):
        pad_num = max_gt_num - gt_labels_3d[sample_idx].shape[0]
        center_targets[sample_idx] = F.pad(center_targets[sample_idx],
                                           (0, 0, 0, pad_num))
        valid_gt_masks[sample_idx] = F.pad(valid_gt_masks[sample_idx],
                                           (0, pad_num))

    vote_targets = torch.stack(vote_targets)
    vote_target_masks = torch.stack(vote_target_masks)
    center_targets = torch.stack(center_targets)
    valid_gt_masks = torch.stack(valid_gt_masks)
    assigned_center_targets = torch.stack(assigned_center_targets)
    objectness_targets = torch.stack(objectness_targets)

    # Normalise every weight tensor so that it sums to (almost) one.
    objectness_weights = torch.stack(objectness_masks)
    objectness_weights /= (torch.sum(objectness_weights) + 1e-6)
    positive_total = torch.sum(objectness_targets).float() + 1e-6
    box_loss_weights = objectness_targets.float() / positive_total
    valid_gt_total = torch.sum(valid_gt_masks.float()) + 1e-6
    valid_gt_weights = valid_gt_masks.float() / valid_gt_total

    dir_class_targets = torch.stack(dir_class_targets)
    dir_res_targets = torch.stack(dir_res_targets)
    size_class_targets = torch.stack(size_class_targets)
    size_res_targets = torch.stack(size_res_targets)
    mask_targets = torch.stack(mask_targets)

    return (vote_targets, vote_target_masks, size_class_targets,
            size_res_targets, dir_class_targets, dir_res_targets,
            center_targets, assigned_center_targets, mask_targets,
            valid_gt_masks, objectness_targets, objectness_weights,
            box_loss_weights, valid_gt_weights)
""" assert self.bbox_coder.with_rot or pts_semantic_mask is not None gt_bboxes_3d = gt_bboxes_3d.to(points.device) # generate votes target num_points = points.shape[0] if self.bbox_coder.with_rot: vote_targets = points.new_zeros([num_points, 3 * self.gt_per_seed]) vote_target_masks = points.new_zeros([num_points], dtype=torch.long) vote_target_idx = points.new_zeros([num_points], dtype=torch.long) box_indices_all = gt_bboxes_3d.points_in_boxes(points) for i in range(gt_labels_3d.shape[0]): box_indices = box_indices_all[:, i] indices = torch.nonzero( box_indices, as_tuple=False).squeeze(-1) selected_points = points[indices] vote_target_masks[indices] = 1 vote_targets_tmp = vote_targets[indices] votes = gt_bboxes_3d.gravity_center[i].unsqueeze( 0) - selected_points[:, :3] for j in range(self.gt_per_seed): column_indices = torch.nonzero( vote_target_idx[indices] == j, as_tuple=False).squeeze(-1) vote_targets_tmp[column_indices, int(j * 3):int(j * 3 + 3)] = votes[column_indices] if j == 0: vote_targets_tmp[column_indices] = votes[ column_indices].repeat(1, self.gt_per_seed) vote_targets[indices] = vote_targets_tmp vote_target_idx[indices] = torch.clamp( vote_target_idx[indices] + 1, max=2) elif pts_semantic_mask is not None: vote_targets = points.new_zeros([num_points, 3]) vote_target_masks = points.new_zeros([num_points], dtype=torch.long) for i in torch.unique(pts_instance_mask): indices = torch.nonzero( pts_instance_mask == i, as_tuple=False).squeeze(-1) if pts_semantic_mask[indices[0]] < self.num_classes: selected_points = points[indices, :3] center = 0.5 * ( selected_points.min(0)[0] + selected_points.max(0)[0]) vote_targets[indices, :] = center - selected_points vote_target_masks[indices] = 1 vote_targets = vote_targets.repeat((1, self.gt_per_seed)) else: raise NotImplementedError (center_targets, size_class_targets, size_res_targets, dir_class_targets, dir_res_targets) = self.bbox_coder.encode(gt_bboxes_3d, gt_labels_3d) proposal_num = aggregated_points.shape[0] 
def get_bboxes(self,
               points,
               bbox_preds,
               input_metas,
               rescale=False,
               use_nms=True):
    """Turn vote head predictions into bounding boxes.

    Args:
        points (torch.Tensor): Input points.
        bbox_preds (dict): Predictions from vote head.
        input_metas (list[dict]): Point cloud and image's meta info.
        rescale (bool): Whether to rescale bboxes. Not read by this
            method.
        use_nms (bool): Whether to apply NMS, skip nms postprocessing
            while using vote head in rpn stage.

    Returns:
        list[tuple[torch.Tensor]] | torch.Tensor: With ``use_nms`` one
        ``(bboxes, scores, labels)`` tuple per sample, otherwise the raw
        decoded boxes from the bbox coder.
    """
    # Objectness probability is the last softmax channel.
    obj_scores = F.softmax(bbox_preds['obj_scores'], dim=-1)[..., -1]
    sem_scores = F.softmax(bbox_preds['sem_scores'], dim=-1)
    bbox3d = self.bbox_coder.decode(bbox_preds)

    if not use_nms:
        return bbox3d

    results = []
    for sample_idx in range(bbox3d.shape[0]):
        kept_boxes, kept_scores, kept_labels = self.multiclass_nms_single(
            obj_scores[sample_idx], sem_scores[sample_idx],
            bbox3d[sample_idx], points[sample_idx, ..., :3],
            input_metas[sample_idx])
        # Wrap the surviving boxes in the box class of this sample.
        box_cls = input_metas[sample_idx]['box_type_3d']
        wrapped = box_cls(
            kept_boxes,
            box_dim=kept_boxes.shape[-1],
            with_yaw=self.bbox_coder.with_rot)
        results.append((wrapped, kept_scores, kept_labels))
    return results
""" bbox = input_meta['box_type_3d']( bbox, box_dim=bbox.shape[-1], with_yaw=self.bbox_coder.with_rot, origin=(0.5, 0.5, 0.5)) box_indices = bbox.points_in_boxes(points) corner3d = bbox.corners minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6))) minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0] minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0] nonempty_box_mask = box_indices.T.sum(1) > 5 bbox_classes = torch.argmax(sem_scores, -1) nms_selected = aligned_3d_nms(minmax_box3d[nonempty_box_mask], obj_scores[nonempty_box_mask], bbox_classes[nonempty_box_mask], self.test_cfg.nms_thr) # filter empty boxes and boxes with low score scores_mask = (obj_scores > self.test_cfg.score_thr) nonempty_box_inds = torch.nonzero( nonempty_box_mask, as_tuple=False).flatten() nonempty_mask = torch.zeros_like(bbox_classes).scatter( 0, nonempty_box_inds[nms_selected], 1) selected = (nonempty_mask.bool() & scores_mask.bool()) if self.test_cfg.per_class_proposal: bbox_selected, score_selected, labels = [], [], [] for k in range(sem_scores.shape[-1]): bbox_selected.append(bbox[selected].tensor) score_selected.append(obj_scores[selected] * sem_scores[selected][:, k]) labels.append( torch.zeros_like(bbox_classes[selected]).fill_(k)) bbox_selected = torch.cat(bbox_selected, 0) score_selected = torch.cat(score_selected, 0) labels = torch.cat(labels, 0) else: bbox_selected = bbox[selected].tensor score_selected = obj_scores[selected] labels = bbox_classes[selected] return bbox_selected, score_selected, labels ================================================ FILE: mmdet3d/models/detectors/__init__.py ================================================ from .base import Base3DDetector from .centerpoint import CenterPoint from .dynamic_voxelnet import DynamicVoxelNet from .h3dnet import H3DNet from .imvotenet import ImVoteNet from .mvx_faster_rcnn import DynamicMVXFasterRCNN, MVXFasterRCNN from .mvx_two_stage import MVXTwoStageDetector from .parta2 import PartA2 from .ssd3dnet import 
class Base3DDetector(BaseDetector):
    """Base class for detectors."""

    def forward_test(self, points, img_metas, img=None, **kwargs):
        """Dispatch a test-time call to ``simple_test`` or ``aug_test``.

        Args:
            points (list[torch.Tensor]): the outer list indicates test-time
                augmentations and inner torch.Tensor should have a shape
                NxC, which contains all points in the batch.
            img_metas (list[list[dict]]): the outer list indicates test-time
                augs (multiscale, flip, etc.) and the inner list indicates
                images in a batch
            img (list[torch.Tensor], optional): the outer list indicates
                test-time augmentations and inner torch.Tensor should have
                a shape NxCxHxW, which contains all images in the batch.
                Defaults to None.

        Raises:
            TypeError: If ``points`` or ``img_metas`` is not a list.
            ValueError: If ``points`` and ``img_metas`` have different
                lengths.
        """
        for var, name in [(points, 'points'), (img_metas, 'img_metas')]:
            if not isinstance(var, list):
                raise TypeError('{} must be a list, but got {}'.format(
                    name, type(var)))

        num_augs = len(points)
        if num_augs != len(img_metas):
            raise ValueError(
                'num of augmentations ({}) != num of image meta ({})'.format(
                    len(points), len(img_metas)))

        if num_augs == 1:
            # Wrap a missing image so that ``img[0]`` below yields None.
            img = [img] if img is None else img
            return self.simple_test(points[0], img_metas[0], img[0], **kwargs)
        else:
            return self.aug_test(points, img_metas, img, **kwargs)

    @auto_fp16(apply_to=('img', 'points'))
    def forward(self, return_loss=True, **kwargs):
        """Calls either forward_train or forward_test depending on whether
        return_loss=True.

        Note this setting will change the expected inputs. When
        `return_loss=True`, img and img_metas are single-nested (i.e.
        torch.Tensor and list[dict]), and when `return_loss=False`, img and
        img_metas should be double nested (i.e.  list[torch.Tensor],
        list[list[dict]]), with the outer list indicating test time
        augmentations.
        """
        if return_loss:
            return self.forward_train(**kwargs)
        else:
            return self.forward_test(**kwargs)

    def show_results(self, data, result, out_dir):
        """Results visualization.

        Args:
            data (list[dict]): Input points and the information of the sample.
            result (list[dict]): Prediction results.
            out_dir (str): Output directory of visualization result.

        Raises:
            ValueError: If the points or image metas are stored in an
                unsupported container, or if the box mode of a sample
                cannot be converted to depth mode.
        """
        for batch_id in range(len(result)):
            if isinstance(data['points'][0], DC):
                points = data['points'][0]._data[0][batch_id].numpy()
            elif mmcv.is_list_of(data['points'][0], torch.Tensor):
                points = data['points'][0][batch_id]
            else:
                # The exception used to be constructed without ``raise``,
                # which let execution continue with ``points`` unbound.
                raise ValueError(
                    f"Unsupported data type {type(data['points'][0])} "
                    f'for visualization!')
            if isinstance(data['img_metas'][0], DC):
                pts_filename = data['img_metas'][0]._data[0][batch_id][
                    'pts_filename']
                box_mode_3d = data['img_metas'][0]._data[0][batch_id][
                    'box_mode_3d']
            elif mmcv.is_list_of(data['img_metas'][0], dict):
                pts_filename = data['img_metas'][0][batch_id]['pts_filename']
                box_mode_3d = data['img_metas'][0][batch_id]['box_mode_3d']
            else:
                raise ValueError(
                    f"Unsupported data type {type(data['img_metas'][0])} "
                    f'for visualization!')
            # Output name is the point cloud file name without extension.
            file_name = osp.split(pts_filename)[-1].split('.')[0]

            assert out_dir is not None, 'Expect out_dir, got none.'

            pred_bboxes = result[batch_id]['boxes_3d']

            # for now we convert points and bbox into depth mode
            if (box_mode_3d == Box3DMode.CAM) or (box_mode_3d
                                                  == Box3DMode.LIDAR):
                points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR,
                                                   Coord3DMode.DEPTH)
                pred_bboxes = Box3DMode.convert(pred_bboxes, box_mode_3d,
                                                Box3DMode.DEPTH)
            elif box_mode_3d != Box3DMode.DEPTH:
                # Previously this case was silently visualised without any
                # conversion because the exception was never raised.
                raise ValueError(
                    f'Unsupported box_mode_3d {box_mode_3d} for convertion!')

            pred_bboxes = pred_bboxes.tensor.cpu().numpy()
            show_result(points, None, pred_bboxes, out_dir, file_name)
def extract_pts_feat(self, pts, img_feats, img_metas):
    """Run the point cloud branch and return its feature maps.

    Args:
        pts: Raw points handed to ``self.voxelize``.
        img_feats: Not read by this method.
        img_metas: Not read by this method.

    Returns:
        Output of the point backbone (passed through the point neck when
        ``self.with_pts_neck`` is set), or ``None`` when the detector has
        no point bbox head.
    """
    if self.with_pts_bbox:
        voxels, num_points, coors = self.voxelize(pts)
        encoded_voxels = self.pts_voxel_encoder(voxels, num_points, coors)
        # The batch index is read from column 0 of the last coordinate row.
        num_samples = coors[-1, 0] + 1
        feats = self.pts_backbone(
            self.pts_middle_encoder(encoded_voxels, coors, num_samples))
        return self.pts_neck(feats) if self.with_pts_neck else feats
    return None
The function implementation process is as follows: - step 1: map features back for double-flip augmentation. - step 2: merge all features and generate boxes. - step 3: map boxes back for scale augmentation. - step 4: merge results. Args: feats (list[torch.Tensor]): Feature of point cloud. img_metas (list[dict]): Meta information of samples. rescale (bool): Whether to rescale bboxes. Default: False. Returns: dict: Returned bboxes consists of the following keys: - boxes_3d (:obj:`LiDARInstance3DBoxes`): Predicted bboxes. - scores_3d (torch.Tensor): Scores of predicted boxes. - labels_3d (torch.Tensor): Labels of predicted boxes. """ # only support aug_test for one sample outs_list = [] for x, img_meta in zip(feats, img_metas): outs = self.pts_bbox_head(x) # merge augmented outputs before decoding bboxes for task_id, out in enumerate(outs): for key in out[0].keys(): if img_meta[0]['pcd_horizontal_flip']: outs[task_id][0][key] = torch.flip( outs[task_id][0][key], dims=[2]) if key == 'reg': outs[task_id][0][key][:, 1, ...] = 1 - outs[ task_id][0][key][:, 1, ...] elif key == 'rot': outs[task_id][0][ key][:, 1, ...] = -outs[task_id][0][key][:, 1, ...] elif key == 'vel': outs[task_id][0][ key][:, 1, ...] = -outs[task_id][0][key][:, 1, ...] if img_meta[0]['pcd_vertical_flip']: outs[task_id][0][key] = torch.flip( outs[task_id][0][key], dims=[3]) if key == 'reg': outs[task_id][0][key][:, 0, ...] = 1 - outs[ task_id][0][key][:, 0, ...] elif key == 'rot': outs[task_id][0][ key][:, 0, ...] = -outs[task_id][0][key][:, 0, ...] elif key == 'vel': outs[task_id][0][ key][:, 0, ...] = -outs[task_id][0][key][:, 0, ...] 
outs_list.append(outs) preds_dicts = dict() scale_img_metas = [] # concat outputs sharing the same pcd_scale_factor for i, (img_meta, outs) in enumerate(zip(img_metas, outs_list)): pcd_scale_factor = img_meta[0]['pcd_scale_factor'] if pcd_scale_factor not in preds_dicts.keys(): preds_dicts[pcd_scale_factor] = outs scale_img_metas.append(img_meta) else: for task_id, out in enumerate(outs): for key in out[0].keys(): preds_dicts[pcd_scale_factor][task_id][0][key] += out[ 0][key] aug_bboxes = [] for pcd_scale_factor, preds_dict in preds_dicts.items(): for task_id, pred_dict in enumerate(preds_dict): # merge outputs with different flips before decoding bboxes for key in pred_dict[0].keys(): preds_dict[task_id][0][key] /= len(outs_list) / len( preds_dicts.keys()) bbox_list = self.pts_bbox_head.get_bboxes( preds_dict, img_metas[0], rescale=rescale) bbox_list = [ dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) for bboxes, scores, labels in bbox_list ] aug_bboxes.append(bbox_list[0]) if len(preds_dicts.keys()) > 1: # merge outputs with different scales after decoding bboxes merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, scale_img_metas, self.pts_bbox_head.test_cfg) return merged_bboxes else: for key in bbox_list[0].keys(): bbox_list[0][key] = bbox_list[0][key].to('cpu') return bbox_list[0] def aug_test(self, points, img_metas, imgs=None, rescale=False): """Test function with augmentaiton.""" img_feats, pts_feats = self.extract_feats(points, img_metas, imgs) bbox_list = dict() if pts_feats and self.with_pts_bbox: pts_bbox = self.aug_test_pts(pts_feats, img_metas, rescale) bbox_list.update(pts_bbox=pts_bbox) return [bbox_list] ================================================ FILE: mmdet3d/models/detectors/dynamic_voxelnet.py ================================================ import torch from mmcv.runner import force_fp32 from torch.nn import functional as F from mmdet.models import DETECTORS from .voxelnet import VoxelNet @DETECTORS.register_module() class 
DynamicVoxelNet(VoxelNet): r"""VoxelNet using `dynamic voxelization `_. """ def __init__(self, voxel_layer, voxel_encoder, middle_encoder, backbone, neck=None, bbox_head=None, train_cfg=None, test_cfg=None, pretrained=None): super(DynamicVoxelNet, self).__init__( voxel_layer=voxel_layer, voxel_encoder=voxel_encoder, middle_encoder=middle_encoder, backbone=backbone, neck=neck, bbox_head=bbox_head, train_cfg=train_cfg, test_cfg=test_cfg, pretrained=pretrained, ) def extract_feat(self, points, img_metas): """Extract features from points.""" voxels, coors = self.voxelize(points) voxel_features, feature_coors = self.voxel_encoder(voxels, coors) batch_size = coors[-1, 0].item() + 1 x = self.middle_encoder(voxel_features, feature_coors, batch_size) x = self.backbone(x) if self.with_neck: x = self.neck(x) return x @torch.no_grad() @force_fp32() def voxelize(self, points): """Apply dynamic voxelization to points. Args: points (list[torch.Tensor]): Points of each sample. Returns: tuple[torch.Tensor]: Concatenated points and coordinates. """ coors = [] # dynamic voxelization only provide a coors mapping for res in points: res_coors = self.voxel_layer(res) coors.append(res_coors) points = torch.cat(points, dim=0) coors_batch = [] for i, coor in enumerate(coors): coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) coors_batch.append(coor_pad) coors_batch = torch.cat(coors_batch, dim=0) return points, coors_batch ================================================ FILE: mmdet3d/models/detectors/h3dnet.py ================================================ import torch from mmdet3d.core import merge_aug_bboxes_3d from mmdet.models import DETECTORS from .two_stage import TwoStage3DDetector @DETECTORS.register_module() class H3DNet(TwoStage3DDetector): r"""H3DNet model. 
Please refer to the `paper `_ """ def __init__(self, backbone, neck=None, rpn_head=None, roi_head=None, train_cfg=None, test_cfg=None, pretrained=None): super(H3DNet, self).__init__( backbone=backbone, neck=neck, rpn_head=rpn_head, roi_head=roi_head, train_cfg=train_cfg, test_cfg=test_cfg, pretrained=pretrained) def forward_train(self, points, img_metas, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, gt_bboxes_ignore=None): """Forward of training. Args: points (list[torch.Tensor]): Points of each batch. img_metas (list): Image metas. gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch. gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. pts_semantic_mask (None | list[torch.Tensor]): point-wise semantic label of each batch. pts_instance_mask (None | list[torch.Tensor]): point-wise instance label of each batch. gt_bboxes_ignore (None | list[torch.Tensor]): Specify which bounding. Returns: dict: Losses. """ points_cat = torch.stack(points) feats_dict = self.extract_feat(points_cat) feats_dict['fp_xyz'] = [feats_dict['fp_xyz_net0'][-1]] feats_dict['fp_features'] = [feats_dict['hd_feature']] feats_dict['fp_indices'] = [feats_dict['fp_indices_net0'][-1]] losses = dict() if self.with_rpn: rpn_outs = self.rpn_head(feats_dict, self.train_cfg.rpn.sample_mod) feats_dict.update(rpn_outs) rpn_loss_inputs = (points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, img_metas) rpn_losses = self.rpn_head.loss( rpn_outs, *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore, ret_target=True) feats_dict['targets'] = rpn_losses.pop('targets') losses.update(rpn_losses) # Generate rpn proposals proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) proposal_inputs = (points, rpn_outs, img_metas) proposal_list = self.rpn_head.get_bboxes( *proposal_inputs, use_nms=proposal_cfg.use_nms) feats_dict['proposal_list'] = proposal_list else: raise NotImplementedError roi_losses = 
self.roi_head.forward_train(feats_dict, img_metas, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, gt_bboxes_ignore) losses.update(roi_losses) return losses def simple_test(self, points, img_metas, imgs=None, rescale=False): """Forward of testing. Args: points (list[torch.Tensor]): Points of each sample. img_metas (list): Image metas. rescale (bool): Whether to rescale results. Returns: list: Predicted 3d boxes. """ points_cat = torch.stack(points) feats_dict = self.extract_feat(points_cat) feats_dict['fp_xyz'] = [feats_dict['fp_xyz_net0'][-1]] feats_dict['fp_features'] = [feats_dict['hd_feature']] feats_dict['fp_indices'] = [feats_dict['fp_indices_net0'][-1]] if self.with_rpn: proposal_cfg = self.test_cfg.rpn rpn_outs = self.rpn_head(feats_dict, proposal_cfg.sample_mod) feats_dict.update(rpn_outs) # Generate rpn proposals proposal_list = self.rpn_head.get_bboxes( points, rpn_outs, img_metas, use_nms=proposal_cfg.use_nms) feats_dict['proposal_list'] = proposal_list else: raise NotImplementedError return self.roi_head.simple_test( feats_dict, img_metas, points_cat, rescale=rescale) def aug_test(self, points, img_metas, imgs=None, rescale=False): """Test with augmentation.""" points_cat = [torch.stack(pts) for pts in points] feats_dict = self.extract_feats(points_cat, img_metas) for feat_dict in feats_dict: feat_dict['fp_xyz'] = [feat_dict['fp_xyz_net0'][-1]] feat_dict['fp_features'] = [feat_dict['hd_feature']] feat_dict['fp_indices'] = [feat_dict['fp_indices_net0'][-1]] # only support aug_test for one sample aug_bboxes = [] for feat_dict, pts_cat, img_meta in zip(feats_dict, points_cat, img_metas): if self.with_rpn: proposal_cfg = self.test_cfg.rpn rpn_outs = self.rpn_head(feat_dict, proposal_cfg.sample_mod) feat_dict.update(rpn_outs) # Generate rpn proposals proposal_list = self.rpn_head.get_bboxes( points, rpn_outs, img_metas, use_nms=proposal_cfg.use_nms) feat_dict['proposal_list'] = proposal_list else: raise NotImplementedError 
bbox_results = self.roi_head.simple_test( feat_dict, self.test_cfg.rcnn.sample_mod, img_meta, pts_cat, rescale=rescale) aug_bboxes.append(bbox_results) # after merging, bboxes will be rescaled to the original image size merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, self.bbox_head.test_cfg) return [merged_bboxes] def extract_feats(self, points, img_metas): """Extract features of multiple samples.""" return [ self.extract_feat(pts, img_meta) for pts, img_meta in zip(points, img_metas) ] ================================================ FILE: mmdet3d/models/detectors/imvotenet.py ================================================ import numpy as np import torch from torch import nn as nn from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d from mmdet3d.models.utils import MLP from mmdet.models import DETECTORS from .. import builder from .base import Base3DDetector def sample_valid_seeds(mask, num_sampled_seed=1024): """Randomly sample seeds from all imvotes. Args: mask (torch.Tensor): Bool tensor in shape ( seed_num*max_imvote_per_pixel), indicates whether this imvote corresponds to a 2D bbox. num_sampled_seed (int): How many to sample from all imvotes. Returns: torch.Tensor: Indices with shape (num_sampled_seed). 
""" device = mask.device batch_size = mask.shape[0] sample_inds = mask.new_zeros((batch_size, num_sampled_seed), dtype=torch.int64) for bidx in range(batch_size): # return index of non zero elements valid_inds = torch.nonzero(mask[bidx, :]).squeeze(-1) if len(valid_inds) < num_sampled_seed: # compute set t1 - t2 t1 = torch.arange(num_sampled_seed, device=device) t2 = valid_inds % num_sampled_seed combined = torch.cat((t1, t2)) uniques, counts = combined.unique(return_counts=True) difference = uniques[counts == 1] rand_inds = torch.randperm( len(difference), device=device)[:num_sampled_seed - len(valid_inds)] cur_sample_inds = difference[rand_inds] cur_sample_inds = torch.cat((valid_inds, cur_sample_inds)) else: rand_inds = torch.randperm( len(valid_inds), device=device)[:num_sampled_seed] cur_sample_inds = valid_inds[rand_inds] sample_inds[bidx, :] = cur_sample_inds return sample_inds @DETECTORS.register_module() class ImVoteNet(Base3DDetector): r"""`ImVoteNet `_ for 3D detection.""" def __init__(self, pts_backbone=None, pts_bbox_heads=None, pts_neck=None, img_backbone=None, img_neck=None, img_roi_head=None, img_rpn_head=None, img_mlp=None, freeze_img_branch=False, fusion_layer=None, num_sampled_seed=None, train_cfg=None, test_cfg=None, pretrained=None): super(ImVoteNet, self).__init__() # point branch if pts_backbone is not None: self.pts_backbone = builder.build_backbone(pts_backbone) if pts_neck is not None: self.pts_neck = builder.build_neck(pts_neck) if pts_bbox_heads is not None: pts_bbox_head_common = pts_bbox_heads.common pts_bbox_head_common.update( train_cfg=train_cfg.pts if train_cfg is not None else None) pts_bbox_head_common.update(test_cfg=test_cfg.pts) pts_bbox_head_joint = pts_bbox_head_common.copy() pts_bbox_head_joint.update(pts_bbox_heads.joint) pts_bbox_head_pts = pts_bbox_head_common.copy() pts_bbox_head_pts.update(pts_bbox_heads.pts) pts_bbox_head_img = pts_bbox_head_common.copy() pts_bbox_head_img.update(pts_bbox_heads.img) 
self.pts_bbox_head_joint = builder.build_head(pts_bbox_head_joint) self.pts_bbox_head_pts = builder.build_head(pts_bbox_head_pts) self.pts_bbox_head_img = builder.build_head(pts_bbox_head_img) self.pts_bbox_heads = [ self.pts_bbox_head_joint, self.pts_bbox_head_pts, self.pts_bbox_head_img ] self.loss_weights = pts_bbox_heads.loss_weights # image branch if img_backbone: self.img_backbone = builder.build_backbone(img_backbone) if img_neck is not None: self.img_neck = builder.build_neck(img_neck) if img_rpn_head is not None: rpn_train_cfg = train_cfg.img_rpn if train_cfg \ is not None else None img_rpn_head_ = img_rpn_head.copy() img_rpn_head_.update( train_cfg=rpn_train_cfg, test_cfg=test_cfg.img_rpn) self.img_rpn_head = builder.build_head(img_rpn_head_) if img_roi_head is not None: rcnn_train_cfg = train_cfg.img_rcnn if train_cfg \ is not None else None img_roi_head.update( train_cfg=rcnn_train_cfg, test_cfg=test_cfg.img_rcnn) self.img_roi_head = builder.build_head(img_roi_head) # fusion if fusion_layer is not None: self.fusion_layer = builder.build_fusion_layer(fusion_layer) self.max_imvote_per_pixel = fusion_layer.max_imvote_per_pixel self.freeze_img_branch = freeze_img_branch if freeze_img_branch: self.freeze_img_branch_params() if img_mlp is not None: self.img_mlp = MLP(**img_mlp) self.num_sampled_seed = num_sampled_seed self.train_cfg = train_cfg self.test_cfg = test_cfg self.init_weights(pretrained=pretrained) def init_weights(self, pretrained=None): """Initialize model weights.""" super(ImVoteNet, self).init_weights(pretrained) if pretrained is None: img_pretrained = None pts_pretrained = None elif isinstance(pretrained, dict): img_pretrained = pretrained.get('img', None) pts_pretrained = pretrained.get('pts', None) else: raise ValueError( f'pretrained should be a dict, got {type(pretrained)}') if self.with_img_backbone: self.img_backbone.init_weights(pretrained=img_pretrained) if self.with_img_neck: if isinstance(self.img_neck, nn.Sequential): for m in 
self.img_neck: m.init_weights() else: self.img_neck.init_weights() if self.with_img_roi_head: self.img_roi_head.init_weights(img_pretrained) if self.with_img_rpn: self.img_rpn_head.init_weights() if self.with_pts_backbone: self.pts_backbone.init_weights(pretrained=pts_pretrained) if self.with_pts_bbox: self.pts_bbox_head.init_weights() if self.with_pts_neck: if isinstance(self.pts_neck, nn.Sequential): for m in self.pts_neck: m.init_weights() else: self.pts_neck.init_weights() def freeze_img_branch_params(self): """Freeze all image branch parameters.""" if self.with_img_bbox_head: for param in self.img_bbox_head.parameters(): param.requires_grad = False if self.with_img_backbone: for param in self.img_backbone.parameters(): param.requires_grad = False if self.with_img_neck: for param in self.img_neck.parameters(): param.requires_grad = False if self.with_img_rpn: for param in self.img_rpn_head.parameters(): param.requires_grad = False if self.with_img_roi_head: for param in self.img_roi_head.parameters(): param.requires_grad = False def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): """Overload in order to load img network ckpts into img branch.""" module_names = ['backbone', 'neck', 'roi_head', 'rpn_head'] for key in list(state_dict): for module_name in module_names: if key.startswith(module_name) and ('img_' + key) not in state_dict: state_dict['img_' + key] = state_dict.pop(key) super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) def train(self, mode=True): """Overload in order to keep image branch modules in eval mode.""" super(ImVoteNet, self).train(mode) if self.freeze_img_branch: if self.with_img_bbox_head: self.img_bbox_head.eval() if self.with_img_backbone: self.img_backbone.eval() if self.with_img_neck: self.img_neck.eval() if self.with_img_rpn: self.img_rpn_head.eval() if self.with_img_roi_head: self.img_roi_head.eval() 
@property def with_img_bbox(self): """bool: Whether the detector has a 2D image box head.""" return ((hasattr(self, 'img_roi_head') and self.img_roi_head.with_bbox) or (hasattr(self, 'img_bbox_head') and self.img_bbox_head is not None)) @property def with_img_bbox_head(self): """bool: Whether the detector has a 2D image box head (not roi).""" return hasattr(self, 'img_bbox_head') and self.img_bbox_head is not None @property def with_img_backbone(self): """bool: Whether the detector has a 2D image backbone.""" return hasattr(self, 'img_backbone') and self.img_backbone is not None @property def with_img_neck(self): """bool: Whether the detector has a neck in image branch.""" return hasattr(self, 'img_neck') and self.img_neck is not None @property def with_img_rpn(self): """bool: Whether the detector has a 2D RPN in image detector branch.""" return hasattr(self, 'img_rpn_head') and self.img_rpn_head is not None @property def with_img_roi_head(self): """bool: Whether the detector has a RoI Head in image branch.""" return hasattr(self, 'img_roi_head') and self.img_roi_head is not None @property def with_pts_bbox(self): """bool: Whether the detector has a 3D box head.""" return hasattr(self, 'pts_bbox_head') and self.pts_bbox_head is not None @property def with_pts_backbone(self): """bool: Whether the detector has a 3D backbone.""" return hasattr(self, 'pts_backbone') and self.pts_backbone is not None @property def with_pts_neck(self): """bool: Whether the detector has a neck in 3D detector branch.""" return hasattr(self, 'pts_neck') and self.pts_neck is not None def extract_feat(self, imgs): """Just to inherit from abstract method.""" pass def extract_img_feat(self, img): """Directly extract features from the img backbone+neck.""" x = self.img_backbone(img) if self.with_img_neck: x = self.img_neck(x) return x def extract_img_feats(self, imgs): """Extract features from multiple images. Args: imgs (list[torch.Tensor]): A list of images. 
The images are augmented from the same image but in different ways. Returns: list[torch.Tensor]: Features of different images """ assert isinstance(imgs, list) return [self.extract_img_feat(img) for img in imgs] def extract_pts_feat(self, pts): """Extract features of points.""" x = self.pts_backbone(pts) if self.with_pts_neck: x = self.pts_neck(x) seed_points = x['fp_xyz'][-1] seed_features = x['fp_features'][-1] seed_indices = x['fp_indices'][-1] return (seed_points, seed_features, seed_indices) def extract_pts_feats(self, pts): """Extract features of points from multiple samples.""" assert isinstance(pts, list) return [self.extract_pts_feat(pt) for pt in pts] @torch.no_grad() def extract_bboxes_2d(self, img, img_metas, train=True, bboxes_2d=None, **kwargs): """Extract bounding boxes from 2d detector. Args: img (torch.Tensor): of shape (N, C, H, W) encoding input images. Typically these should be mean centered and std scaled. img_metas (list[dict]): Image meta info. train (bool): train-time or not. bboxes_2d (list[torch.Tensor]): provided 2d bboxes, not supported yet. Return: list[torch.Tensor]: a list of processed 2d bounding boxes. 
"""
        if bboxes_2d is None:
            # No boxes supplied: run the 2D detector (RPN + RoI head).
            x = self.extract_img_feat(img)
            proposal_list = self.img_rpn_head.simple_test_rpn(x, img_metas)
            rets = self.img_roi_head.simple_test(
                x, proposal_list, img_metas, rescale=False)

            rets_processed = []
            for ret in rets:
                # ``ret`` is iterated per class below; flatten it to a single
                # array and remember each row's class index.
                tmp = np.concatenate(ret, axis=0)
                sem_class = img.new_zeros((len(tmp)))
                start = 0
                for i, bboxes in enumerate(ret):
                    sem_class[start:start + len(bboxes)] = i
                    start += len(bboxes)
                ret = img.new_tensor(tmp)

                # append class index
                ret = torch.cat([ret, sem_class[:, None]], dim=-1)
                # sort rows by column 4, descending
                # (presumably the detection score -- confirm)
                inds = torch.argsort(ret[:, 4], descending=True)
                ret = ret.index_select(0, inds)

                # drop half bboxes during training for better generalization
                if train:
                    rand_drop = torch.randperm(len(ret))[:(len(ret) + 1) // 2]
                    # re-sort so the kept rows preserve their order
                    rand_drop = torch.sort(rand_drop)[0]
                    ret = ret[rand_drop]

                rets_processed.append(ret.float())
            return rets_processed
        else:
            # Boxes supplied by the caller: only apply the random drop.
            rets_processed = []
            for ret in bboxes_2d:
                if len(ret) > 0 and train:
                    rand_drop = torch.randperm(len(ret))[:(len(ret) + 1) // 2]
                    rand_drop = torch.sort(rand_drop)[0]
                    ret = ret[rand_drop]
                rets_processed.append(ret.float())
            return rets_processed

    def forward_train(self,
                      points=None,
                      img=None,
                      img_metas=None,
                      gt_bboxes=None,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      proposals=None,
                      calib=None,
                      bboxes_2d=None,
                      gt_bboxes_3d=None,
                      gt_labels_3d=None,
                      pts_semantic_mask=None,
                      pts_instance_mask=None,
                      **kwargs):
        """Forwarding of train for image branch pretrain or stage 2 train.

        When ``points`` is None only the 2D image detector is trained;
        otherwise the three 3D towers (joint, points-only, image-only) are
        trained and their losses combined with ``self.loss_weights``.

        Args:
            points (list[torch.Tensor]): Points of each batch.
            img (torch.Tensor): of shape (N, C, H, W) encoding input images.
                Typically these should be mean centered and std scaled.
            img_metas (list[dict]): list of image and point cloud meta info
                dict. For example, keys include 'ori_shape', 'img_norm_cfg',
                and 'transformation_3d_flow'. For details on the values of
                the keys see `mmdet/datasets/pipelines/formatting.py:Collect`.
            gt_bboxes (list[torch.Tensor]): Ground truth bboxes for each image
                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels (list[torch.Tensor]): class indices for each
                2d bounding box.
            gt_bboxes_ignore (None | list[torch.Tensor]): specify which
                2d bounding boxes can be ignored when computing the loss.
            gt_masks (None | torch.Tensor): true segmentation masks for each
                2d bbox, used if the architecture supports a segmentation task.
            proposals: override rpn proposals (2d) with custom proposals.
                Use when `with_rpn` is False.
            calib (dict[str, torch.Tensor]): camera calibration matrices,
                Rt and K.
            bboxes_2d (list[torch.Tensor]): provided 2d bboxes,
                not supported yet.
            gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): 3d gt bboxes.
            gt_labels_3d (list[torch.Tensor]): gt class labels for 3d bboxes.
            pts_semantic_mask (None | list[torch.Tensor]): point-wise semantic
                label of each batch.
            pts_instance_mask (None | list[torch.Tensor]): point-wise instance
                label of each batch.

        Returns:
            dict[str, torch.Tensor]: a dictionary of loss components.
        """
        if points is None:
            # image-only training of the 2D detector
            x = self.extract_img_feat(img)
            losses = dict()

            # RPN forward and loss
            if self.with_img_rpn:
                proposal_cfg = self.train_cfg.get('img_rpn_proposal',
                                                  self.test_cfg.img_rpn)
                rpn_losses, proposal_list = self.img_rpn_head.forward_train(
                    x,
                    img_metas,
                    gt_bboxes,
                    gt_labels=None,
                    gt_bboxes_ignore=gt_bboxes_ignore,
                    proposal_cfg=proposal_cfg)
                losses.update(rpn_losses)
            else:
                proposal_list = proposals

            roi_losses = self.img_roi_head.forward_train(
                x, img_metas, proposal_list, gt_bboxes, gt_labels,
                gt_bboxes_ignore, gt_masks, **kwargs)
            losses.update(roi_losses)
            return losses
        else:
            bboxes_2d = self.extract_bboxes_2d(
                img, img_metas, bboxes_2d=bboxes_2d, **kwargs)

            points = torch.stack(points)
            seeds_3d, seed_3d_features, seed_indices = \
                self.extract_pts_feat(points)

            img_features, masks = self.fusion_layer(img, bboxes_2d, seeds_3d,
                                                    img_metas, calib)

            inds = sample_valid_seeds(masks, self.num_sampled_seed)
            batch_size, img_feat_size = img_features.shape[:2]
            pts_feat_size = seed_3d_features.shape[1]
            # select the sampled image votes along the last dimension
            inds_img = inds.view(batch_size, 1,
                                 -1).expand(-1, img_feat_size, -1)
            img_features = img_features.gather(-1, inds_img)
            # fold vote indices back onto seed indices
            inds = inds % inds.shape[1]
            inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3)
            seeds_3d = seeds_3d.gather(1, inds_seed_xyz)
            inds_seed_feats = inds.view(batch_size, 1,
                                        -1).expand(-1, pts_feat_size, -1)
            seed_3d_features = seed_3d_features.gather(-1, inds_seed_feats)
            seed_indices = seed_indices.gather(1, inds)

            img_features = self.img_mlp(img_features)
            fused_features = torch.cat([seed_3d_features, img_features], dim=1)

            # the three towers share the seeds and differ in their features
            feat_dict_joint = dict(
                seed_points=seeds_3d,
                seed_features=fused_features,
                seed_indices=seed_indices)
            feat_dict_pts = dict(
                seed_points=seeds_3d,
                seed_features=seed_3d_features,
                seed_indices=seed_indices)
            feat_dict_img = dict(
                seed_points=seeds_3d,
                seed_features=img_features,
                seed_indices=seed_indices)

            loss_inputs = (points, gt_bboxes_3d, gt_labels_3d,
                           pts_semantic_mask, pts_instance_mask, img_metas)
            bbox_preds_joints = self.pts_bbox_head_joint(
                feat_dict_joint, self.train_cfg.pts.sample_mod)
            bbox_preds_pts = self.pts_bbox_head_pts(
                feat_dict_pts, self.train_cfg.pts.sample_mod)
            bbox_preds_img = self.pts_bbox_head_img(
                feat_dict_img, self.train_cfg.pts.sample_mod)
            losses_towers = []
            losses_joint = self.pts_bbox_head_joint.loss(
                bbox_preds_joints,
                *loss_inputs,
                gt_bboxes_ignore=gt_bboxes_ignore)
            losses_pts = self.pts_bbox_head_pts.loss(
                bbox_preds_pts,
                *loss_inputs,
                gt_bboxes_ignore=gt_bboxes_ignore)
            losses_img = self.pts_bbox_head_img.loss(
                bbox_preds_img,
                *loss_inputs,
                gt_bboxes_ignore=gt_bboxes_ignore)
            # order must match ``self.loss_weights`` (joint, pts, img)
            losses_towers.append(losses_joint)
            losses_towers.append(losses_pts)
            losses_towers.append(losses_img)
            combined_losses = dict()
            for loss_term in losses_joint:
                if 'loss' in loss_term:
                    # weighted sum of this term over the three towers
                    combined_losses[loss_term] = 0
                    for i in range(len(losses_towers)):
                        combined_losses[loss_term] += \
                            losses_towers[i][loss_term] * \
                            self.loss_weights[i]
                else:
                    # only save the metric of the joint head
                    # if it is not a loss
                    combined_losses[loss_term] = \
                        losses_towers[0][loss_term]

            return combined_losses

    def forward_test(self,
                     points=None,
                     img_metas=None,
                     img=None,
                     calib=None,
                     bboxes_2d=None,
                     **kwargs):
        """Forwarding of test for image branch pretrain or stage 2 train.

        Dispatches to the image-only test functions when ``points`` is None
        and to the 3D test functions otherwise; a single augmentation goes to
        the ``simple_test*`` variant, several to the ``aug_test*`` variant.

        Args:
            points (list[list[torch.Tensor]], optional): the outer
                list indicates test-time augmentations and the inner
                list contains all points in the batch, where each Tensor
                should have a shape NxC. Defaults to None.
            img_metas (list[list[dict]], optional): the outer list
                indicates test-time augs (multiscale, flip, etc.)
                and the inner list indicates images in a batch.
                Defaults to None.
            img (list[list[torch.Tensor]], optional): the outer
                list indicates test-time augmentations and inner Tensor
                should have a shape NxCxHxW, which contains all images
                in the batch. Defaults to None.
            calib (list[dict[str, torch.Tensor]], optional): camera
                calibration matrices, Rt and K.
                List indicates test-time augs. Defaults to None.
            bboxes_2d (list[list[torch.Tensor]], optional):
                Provided 2d bboxes, not supported yet. Defaults to None.

        Returns:
            list[list[torch.Tensor]]|list[dict]: Predicted 2d or 3d boxes.

        Raises:
            TypeError: If a required input is not a list.
            ValueError: If the number of augmentations and metas differ.
        """
        if points is None:
            for var, name in [(img, 'img'), (img_metas, 'img_metas')]:
                if not isinstance(var, list):
                    raise TypeError(
                        f'{name} must be a list, but got {type(var)}')

            num_augs = len(img)
            if num_augs != len(img_metas):
                raise ValueError(f'num of augmentations ({len(img)}) '
                                 f'!= num of image meta ({len(img_metas)})')

            if num_augs == 1:
                # proposals (List[List[Tensor]]): the outer list indicates
                # test-time augs (multiscale, flip, etc.) and the inner list
                # indicates images in a batch.
                # The Tensor should have a shape Px4, where P is the number of
                # proposals.
                if 'proposals' in kwargs:
                    kwargs['proposals'] = kwargs['proposals'][0]
                return self.simple_test_img_only(
                    img=img[0], img_metas=img_metas[0], **kwargs)
            else:
                assert img[0].size(0) == 1, 'aug test does not support ' \
                                         'inference with batch size ' \
                                         f'{img[0].size(0)}'
                # TODO: support test augmentation for predefined proposals
                assert 'proposals' not in kwargs
                return self.aug_test_img_only(
                    img=img, img_metas=img_metas, **kwargs)

        else:
            for var, name in [(points, 'points'), (img_metas, 'img_metas')]:
                if not isinstance(var, list):
                    raise TypeError('{} must be a list, but got {}'.format(
                        name, type(var)))

            num_augs = len(points)
            if num_augs != len(img_metas):
                raise ValueError(
                    'num of augmentations ({}) != num of image meta ({})'.
                    format(len(points), len(img_metas)))

            if num_augs == 1:
                # NOTE(review): ``calib`` is indexed unconditionally although
                # it defaults to None -- it is effectively required here.
                return self.simple_test(
                    points[0],
                    img_metas[0],
                    img[0],
                    calibs=calib[0],
                    bboxes_2d=bboxes_2d[0] if bboxes_2d is not None else None,
                    **kwargs)
            else:
                return self.aug_test(points, img_metas, img, calib, bboxes_2d,
                                     **kwargs)

    def simple_test_img_only(self,
                             img,
                             img_metas,
                             proposals=None,
                             rescale=False):
        """Test without augmentation, image network pretrain. May refer to
        https://github.com/open-
        mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py  #
        noqa.

        Args:
            img (torch.Tensor): Should have a shape NxCxHxW, which contains
                all images in the batch.
            img_metas (list[dict]): Meta info of the images in the batch.
            proposals (list[Tensor], optional): override rpn proposals
                with custom proposals. Defaults to None.
            rescale (bool, optional): Whether or not rescale bboxes to the
                original shape of input image. Defaults to False.

        Returns:
            list[list[torch.Tensor]]: Predicted 2d boxes.
        """
        assert self.with_img_bbox, 'Img bbox head must be implemented.'
        assert self.with_img_backbone, 'Img backbone must be implemented.'
        assert self.with_img_rpn, 'Img rpn must be implemented.'
        assert self.with_img_roi_head, 'Img roi head must be implemented.'

        x = self.extract_img_feat(img)

        if proposals is None:
            proposal_list = self.img_rpn_head.simple_test_rpn(x, img_metas)
        else:
            proposal_list = proposals

        ret = self.img_roi_head.simple_test(
            x, proposal_list, img_metas, rescale=rescale)

        return ret

    def simple_test(self,
                    points=None,
                    img_metas=None,
                    img=None,
                    calibs=None,
                    bboxes_2d=None,
                    rescale=False,
                    **kwargs):
        """Test without augmentation, stage 2.

        Args:
            points (list[torch.Tensor], optional): Elements in the list
                should have a shape NxC, the list indicates all point-clouds
                in the batch. Defaults to None.
            img_metas (list[dict], optional): List indicates
                images in a batch. Defaults to None.
            img (torch.Tensor, optional): Should have a shape NxCxHxW,
                which contains all images in the batch. Defaults to None.
            calibs (dict[str, torch.Tensor], optional): camera
                calibration matrices, Rt and K. Defaults to None.
            bboxes_2d (list[torch.Tensor], optional):
                Provided 2d bboxes, not supported yet. Defaults to None.
            rescale (bool, optional): Whether or not rescale bboxes.
                Defaults to False.

        Returns:
            list[dict]: Predicted 3d boxes.
""" bboxes_2d = self.extract_bboxes_2d( img, img_metas, train=False, bboxes_2d=bboxes_2d, **kwargs) points = torch.stack(points) seeds_3d, seed_3d_features, seed_indices = \ self.extract_pts_feat(points) img_features, masks = self.fusion_layer(img, bboxes_2d, seeds_3d, img_metas, calibs) inds = sample_valid_seeds(masks, self.num_sampled_seed) batch_size, img_feat_size = img_features.shape[:2] pts_feat_size = seed_3d_features.shape[1] inds_img = inds.view(batch_size, 1, -1).expand(-1, img_feat_size, -1) img_features = img_features.gather(-1, inds_img) inds = inds % inds.shape[1] inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3) seeds_3d = seeds_3d.gather(1, inds_seed_xyz) inds_seed_feats = inds.view(batch_size, 1, -1).expand(-1, pts_feat_size, -1) seed_3d_features = seed_3d_features.gather(-1, inds_seed_feats) seed_indices = seed_indices.gather(1, inds) img_features = self.img_mlp(img_features) fused_features = torch.cat([seed_3d_features, img_features], dim=1) feat_dict = dict( seed_points=seeds_3d, seed_features=fused_features, seed_indices=seed_indices) bbox_preds = self.pts_bbox_head_joint(feat_dict, self.test_cfg.pts.sample_mod) bbox_list = self.pts_bbox_head_joint.get_bboxes( points, bbox_preds, img_metas, rescale=rescale) bbox_results = [ bbox3d2result(bboxes, scores, labels) for bboxes, scores, labels in bbox_list ] return bbox_results def aug_test_img_only(self, img, img_metas, rescale=False): """Test function with augmentation, image network pretrain. May refer to https://github.com/open- mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py # noqa. Args: img (list[list[torch.Tensor]], optional): the outer list indicates test-time augmentations and inner Tensor should have a shape NxCxHxW, which contains all images in the batch. Defaults to None. Defaults to None. img_metas (list[list[dict]], optional): the outer list indicates test-time augs (multiscale, flip, etc.) and the inner list indicates images in a batch. Defaults to None. 
rescale (bool, optional): Whether or not rescale bboxes to the original shape of input image. If rescale is False, then returned bboxes and masks will fit the scale of imgs[0]. Defaults to None. Returns: list[list[torch.Tensor]]: Predicted 2d boxes. """ assert self.with_img_bbox, 'Img bbox head must be implemented.' assert self.with_img_backbone, 'Img backbone must be implemented.' assert self.with_img_rpn, 'Img rpn must be implemented.' assert self.with_img_roi_head, 'Img roi head must be implemented.' x = self.extract_img_feats(img) proposal_list = self.img_rpn_head.aug_test_rpn(x, img_metas) return self.img_roi_head.aug_test( x, proposal_list, img_metas, rescale=rescale) def aug_test(self, points=None, img_metas=None, imgs=None, calibs=None, bboxes_2d=None, rescale=False, **kwargs): """Test function with augmentation, stage 2. Args: points (list[list[torch.Tensor]], optional): the outer list indicates test-time augmentations and the inner list contains all points in the batch, where each Tensor should have a shape NxC. Defaults to None. img_metas (list[list[dict]], optional): the outer list indicates test-time augs (multiscale, flip, etc.) and the inner list indicates images in a batch. Defaults to None. imgs (list[list[torch.Tensor]], optional): the outer list indicates test-time augmentations and inner Tensor should have a shape NxCxHxW, which contains all images in the batch. Defaults to None. Defaults to None. calibs (list[dict[str, torch.Tensor]], optional): camera calibration matrices, Rt and K. List indicates test-time augs. Defaults to None. bboxes_2d (list[list[torch.Tensor]], optional): Provided 2d bboxes, not supported yet. Defaults to None. rescale (bool, optional): Whether or not rescale bboxes. Defaults to False. Returns: list[dict]: Predicted 3d boxes. 
""" points_cat = [torch.stack(pts) for pts in points] feats = self.extract_pts_feats(points_cat, img_metas) # only support aug_test for one sample aug_bboxes = [] for x, pts_cat, img_meta, bbox_2d, img, calib in zip( feats, points_cat, img_metas, bboxes_2d, imgs, calibs): bbox_2d = self.extract_bboxes_2d( img, img_metas, train=False, bboxes_2d=bbox_2d, **kwargs) seeds_3d, seed_3d_features, seed_indices = x img_features, masks = self.fusion_layer(img, bbox_2d, seeds_3d, img_metas, calib) inds = sample_valid_seeds(masks, self.num_sampled_seed) batch_size, img_feat_size = img_features.shape[:2] pts_feat_size = seed_3d_features.shape[1] inds_img = inds.view(batch_size, 1, -1).expand(-1, img_feat_size, -1) img_features = img_features.gather(-1, inds_img) inds = inds % inds.shape[1] inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3) seeds_3d = seeds_3d.gather(1, inds_seed_xyz) inds_seed_feats = inds.view(batch_size, 1, -1).expand(-1, pts_feat_size, -1) seed_3d_features = seed_3d_features.gather(-1, inds_seed_feats) seed_indices = seed_indices.gather(1, inds) img_features = self.img_mlp(img_features) fused_features = torch.cat([seed_3d_features, img_features], dim=1) feat_dict = dict( seed_points=seeds_3d, seed_features=fused_features, seed_indices=seed_indices) bbox_preds = self.pts_bbox_head_joint(feat_dict, self.test_cfg.pts.sample_mod) bbox_list = self.pts_bbox_head_joint.get_bboxes( pts_cat, bbox_preds, img_metas, rescale=rescale) bbox_list = [ dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) for bboxes, scores, labels in bbox_list ] aug_bboxes.append(bbox_list[0]) # after merging, bboxes will be rescaled to the original image size merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, self.bbox_head.test_cfg) return [merged_bboxes] ================================================ FILE: mmdet3d/models/detectors/mvx_faster_rcnn.py ================================================ import torch from mmcv.runner import force_fp32 from torch.nn 
@DETECTORS.register_module()
class DynamicMVXFasterRCNN(MVXTwoStageDetector):
    """MVX detector variant whose voxel layer only yields voxel coordinates.

    The voxel layer is called once per sample and returns a single
    coordinate tensor; the raw points themselves are forwarded to the voxel
    encoder together with those coordinates.
    """

    def __init__(self, **kwargs):
        super(DynamicMVXFasterRCNN, self).__init__(**kwargs)

    @torch.no_grad()
    @force_fp32()
    def voxelize(self, points):
        """Compute voxel coordinates for every sample and merge the batch.

        Args:
            points (list[torch.Tensor]): Points of each sample.

        Returns:
            tuple[torch.Tensor]: All points concatenated along dim 0, and
                the matching coordinates with the sample index prepended
                as an extra leading column.
        """
        # dynamic voxelization only provide a coors mapping
        per_sample_coors = [self.pts_voxel_layer(cloud) for cloud in points]
        merged_points = torch.cat(points, dim=0)
        # Left-pad each coordinate row with the index of its sample.
        indexed_coors = [
            F.pad(sample_coors, (1, 0), mode='constant', value=sample_idx)
            for sample_idx, sample_coors in enumerate(per_sample_coors)
        ]
        return merged_points, torch.cat(indexed_coors, dim=0)

    def extract_pts_feat(self, points, img_feats, img_metas):
        """Run the point branch: voxelize, encode, backbone, optional neck.

        Returns None when the detector has no 3D box head.
        """
        if not self.with_pts_bbox:
            return None
        voxels, coors = self.voxelize(points)
        voxel_features, feature_coors = self.pts_voxel_encoder(
            voxels, coors, points, img_feats, img_metas)
        # The sample index sits in column 0; the last row carries the
        # largest one because samples are concatenated in order.
        batch_size = coors[-1, 0] + 1
        feats = self.pts_middle_encoder(voxel_features, feature_coors,
                                        batch_size)
        feats = self.pts_backbone(feats)
        if self.with_pts_neck:
            feats = self.pts_neck(feats)
        return feats
def __init__(self,
             freeze_img=True,
             freeze_img_head=False,
             pts_voxel_layer=None,
             pts_voxel_encoder=None,
             pts_middle_encoder=None,
             pts_fusion_layer=None,
             img_backbone=None,
             pts_backbone=None,
             img_neck=None,
             pts_neck=None,
             pts_bbox_head=None,
             img_roi_head=None,
             img_rpn_head=None,
             train_cfg=None,
             test_cfg=None,
             pretrained=None,
             ):
    """Build the configured sub-modules and initialize their weights.

    Every component is optional: an attribute is only created when its
    config is given, which is what the ``with_*`` properties of this class
    test for.

    Args:
        freeze_img (bool): Stored on the instance; ``init_weights`` uses
            it to disable gradients of the image backbone and neck.
            Defaults to True.
        freeze_img_head (bool): Stored on the instance; not read by this
            class itself. Defaults to False.
        pts_voxel_layer (dict, optional): Keyword arguments for
            ``Voxelization``.
        pts_voxel_encoder (dict, optional): Config of the voxel encoder.
        pts_middle_encoder (dict, optional): Config of the middle encoder.
        pts_fusion_layer (dict, optional): Config of the fusion layer.
        img_backbone (dict, optional): Config of the image backbone.
        pts_backbone (dict, optional): Config of the point backbone.
        img_neck (dict, optional): Config of the image neck.
        pts_neck (dict, optional): Config of the point neck.
        pts_bbox_head (dict, optional): Config of the 3D box head. It is
            updated in place with ``train_cfg.pts`` / ``test_cfg.pts``.
        img_roi_head (dict, optional): Config of the image RoI head.
        img_rpn_head (dict, optional): Config of the image RPN head.
        train_cfg (optional): Training config; its ``pts`` entry is
            forwarded to the 3D box head.
        test_cfg (optional): Test config; its ``pts`` entry is forwarded
            to the 3D box head.
        pretrained (dict, optional): Passed to ``init_weights``.
    """
    super(MVXTwoStageDetector, self).__init__()

    self.freeze_img = freeze_img
    self.freeze_img_head = freeze_img_head
    if pts_voxel_layer:
        self.pts_voxel_layer = Voxelization(**pts_voxel_layer)
    if pts_voxel_encoder:
        self.pts_voxel_encoder = builder.build_voxel_encoder(
            pts_voxel_encoder)
    if pts_middle_encoder:
        self.pts_middle_encoder = builder.build_middle_encoder(
            pts_middle_encoder)
    if pts_backbone:
        self.pts_backbone = builder.build_backbone(pts_backbone)
    if pts_fusion_layer:
        self.pts_fusion_layer = builder.build_fusion_layer(
            pts_fusion_layer)
    if pts_neck is not None:
        self.pts_neck = builder.build_neck(pts_neck)
    if pts_bbox_head:
        # The head receives only the point-branch part of the configs;
        # note that the caller's dict is mutated here.
        pts_train_cfg = train_cfg.pts if train_cfg else None
        pts_bbox_head.update(train_cfg=pts_train_cfg)
        pts_test_cfg = test_cfg.pts if test_cfg else None
        pts_bbox_head.update(test_cfg=pts_test_cfg)
        self.pts_bbox_head = builder.build_head(pts_bbox_head)

    if img_backbone:
        self.img_backbone = builder.build_backbone(img_backbone)
    if img_neck is not None:
        self.img_neck = builder.build_neck(img_neck)
    if img_rpn_head is not None:
        self.img_rpn_head = builder.build_head(img_rpn_head)
    if img_roi_head is not None:
        self.img_roi_head = builder.build_head(img_roi_head)

    self.train_cfg = train_cfg
    self.test_cfg = test_cfg
    # Dispatches to the most-derived ``init_weights``, so subclasses that
    # override it are already initialized when this constructor returns.
    self.init_weights(pretrained=pretrained)
@property
def with_img_bbox(self):
    """bool: Whether the detector has a 2D image box head.

    True when either

    * an ``img_roi_head`` is present and reports a bbox branch through its
      ``with_bbox`` attribute, or
    * a non-None ``img_bbox_head`` attribute is present (the only case the
      previous implementation recognised).

    The constructor of this detector never creates ``img_bbox_head``; the
    image box predictor it builds is ``img_roi_head``, and every code path
    guarded by this property calls ``self.img_roi_head``. Checking only
    ``img_bbox_head`` therefore left those paths unreachable for a
    detector configured with an RoI head.
    """
    roi_head = getattr(self, 'img_roi_head', None)
    has_roi_bbox = roi_head is not None and bool(
        getattr(roi_head, 'with_bbox', False))
    has_bbox_head = getattr(self, 'img_bbox_head', None) is not None
    return has_roi_bbox or has_bbox_head
@property
def with_fusion(self):
    """bool: Whether the detector has a fusion layer.

    The fusion layer is stored as ``pts_fusion_layer``. The second half of
    the check previously read ``self.fusion_layer``, an attribute that is
    never assigned, so the property raised ``AttributeError`` exactly when
    a fusion layer was configured.
    """
    return (hasattr(self, 'pts_fusion_layer')
            and self.pts_fusion_layer is not None)
@torch.no_grad()
@force_fp32()
def voxelize(self, points):
    """Voxelize every sample and merge the results into one batch.

    Args:
        points (list[torch.Tensor]): Points of each sample.

    Returns:
        tuple[torch.Tensor]: Concatenated voxels, number of points per
            voxel, and coordinates. The coordinates get the sample index
            prepended as an extra leading column.
    """
    voxel_chunks, coor_chunks, count_chunks = [], [], []
    for cloud in points:
        sample_voxels, sample_coors, sample_counts = \
            self.pts_voxel_layer(cloud)
        voxel_chunks.append(sample_voxels)
        coor_chunks.append(sample_coors)
        count_chunks.append(sample_counts)

    voxels = torch.cat(voxel_chunks, dim=0)
    num_points = torch.cat(count_chunks, dim=0)
    # Left-pad each coordinate row with the index of the sample it came
    # from so the rows stay attributable after concatenation.
    coors_batch = torch.cat([
        F.pad(sample_coors, (1, 0), mode='constant', value=sample_idx)
        for sample_idx, sample_coors in enumerate(coor_chunks)
    ], dim=0)
    return voxels, num_points, coors_batch
def forward_pts_train(self,
                      pts_feats,
                      img_feats,
                      gt_bboxes_3d,
                      gt_labels_3d,
                      img_metas,
                      gt_bboxes_ignore=None):
    """Compute the losses of the point cloud branch.

    Args:
        pts_feats (list[torch.Tensor]): Features of point cloud branch.
        img_feats: Image features, forwarded to the box head unchanged.
        gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
            boxes for each sample.
        gt_labels_3d (list[torch.Tensor]): Ground truth labels for
            boxes of each sample.
        img_metas (list[dict]): Meta information of samples.
        gt_bboxes_ignore (list[torch.Tensor], optional): Accepted for
            interface compatibility; not used by this method.
            Defaults to None.

    Returns:
        dict: Losses of each branch.
    """
    head = self.pts_bbox_head
    predictions = head(pts_feats, img_feats, img_metas)
    return head.loss(gt_bboxes_3d, gt_labels_3d, predictions)
def simple_test_img(self, x, img_metas, proposals=None, rescale=False):
    """Run the image RoI head without test-time augmentation.

    Proposals are generated by the image RPN (using ``test_cfg.img_rpn``)
    unless they are supplied by the caller.
    """
    proposal_list = proposals
    if proposal_list is None:
        proposal_list = self.simple_test_rpn(x, img_metas,
                                             self.test_cfg.img_rpn)
    return self.img_roi_head.simple_test(
        x, proposal_list, img_metas, rescale=rescale)
def simple_test(self, points, img_metas, img=None, rescale=False):
    """Test function without augmentation.

    Returns one dict per entry of ``img_metas``. A ``'pts_bbox'`` key is
    filled in when point features and a 3D head are available, and an
    ``'img_bbox'`` key when image features and a 2D head are available.
    """
    img_feats, pts_feats = self.extract_feat(
        points, img=img, img_metas=img_metas)

    results = [dict() for _ in range(len(img_metas))]

    if pts_feats and self.with_pts_bbox:
        pts_dets = self.simple_test_pts(
            pts_feats, img_feats, img_metas, rescale=rescale)
        for sample_result, det in zip(results, pts_dets):
            sample_result['pts_bbox'] = det

    if img_feats and self.with_img_bbox:
        img_dets = self.simple_test_img(
            img_feats, img_metas, rescale=rescale)
        for sample_result, det in zip(results, img_dets):
            sample_result['img_bbox'] = det

    return results
def show_results(self, data, result, out_dir):
    """Results visualization.

    For every sample in ``result`` the predicted boxes with a score above
    0.1 are converted to depth mode and written out with ``show_result``.

    Args:
        data (dict): Input points and the information of the sample.
        result (dict): Prediction results.
        out_dir (str): Output directory of visualization result.

    Raises:
        ValueError: If ``data['points'][0]`` or ``data['img_metas'][0]``
            is neither a ``DataContainer`` nor a list of the expected
            element type, or if the sample's ``box_mode_3d`` is not one of
            CAM, LIDAR or DEPTH. These errors used to be constructed but
            never raised, so bad input surfaced later as a ``NameError``
            or was visualized in the wrong coordinate mode.
    """
    for batch_id in range(len(result)):
        if isinstance(data['points'][0], DC):
            points = data['points'][0]._data[0][batch_id].numpy()
        elif mmcv.is_list_of(data['points'][0], torch.Tensor):
            points = data['points'][0][batch_id]
        else:
            raise ValueError(
                f"Unsupported data type {type(data['points'][0])} "
                f'for visualization!')
        if isinstance(data['img_metas'][0], DC):
            pts_filename = data['img_metas'][0]._data[0][batch_id][
                'pts_filename']
            box_mode_3d = data['img_metas'][0]._data[0][batch_id][
                'box_mode_3d']
        elif mmcv.is_list_of(data['img_metas'][0], dict):
            pts_filename = data['img_metas'][0][batch_id]['pts_filename']
            box_mode_3d = data['img_metas'][0][batch_id]['box_mode_3d']
        else:
            raise ValueError(
                f"Unsupported data type {type(data['img_metas'][0])} "
                f'for visualization!')
        # Output name: basename of the point file up to its first dot.
        file_name = osp.split(pts_filename)[-1].split('.')[0]

        assert out_dir is not None, 'Expect out_dir, got none.'
        # Keep only predictions scoring above the fixed 0.1 threshold.
        inds = result[batch_id]['pts_bbox']['scores_3d'] > 0.1
        pred_bboxes = result[batch_id]['pts_bbox']['boxes_3d'][inds]

        # for now we convert points and bbox into depth mode
        if (box_mode_3d == Box3DMode.CAM) or (box_mode_3d
                                              == Box3DMode.LIDAR):
            # NOTE(review): points are always converted from LIDAR, even
            # when box_mode_3d is CAM -- confirm this is intended.
            points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR,
                                               Coord3DMode.DEPTH)
            pred_bboxes = Box3DMode.convert(pred_bboxes, box_mode_3d,
                                            Box3DMode.DEPTH)
        elif box_mode_3d != Box3DMode.DEPTH:
            raise ValueError(
                f'Unsupported box_mode_3d {box_mode_3d} for convertion!')

        pred_bboxes = pred_bboxes.tensor.cpu().numpy()
        show_result(points, None, pred_bboxes, out_dir, file_name)
def extract_feat(self, points, img_metas):
    """Voxelize the points and run encoder, backbone and optional neck.

    Returns:
        tuple[dict, dict]: The feature dict produced by the middle
            encoder (with ``'neck_feats'`` added when a neck exists) and
            the voxel dict returned by ``voxelize``.
    """
    voxel_dict = self.voxelize(points)
    coors = voxel_dict['coors']
    voxel_features = self.voxel_encoder(voxel_dict['voxels'],
                                        voxel_dict['num_points'], coors)
    # Column 0 holds the sample index; the last row has the largest one.
    batch_size = coors[-1, 0].item() + 1
    feats_dict = self.middle_encoder(voxel_features, coors, batch_size)
    backbone_out = self.backbone(feats_dict['spatial_features'])
    if self.with_neck:
        feats_dict.update({'neck_feats': self.neck(backbone_out)})
    return feats_dict, voxel_dict
def simple_test(self, points, img_metas, proposals=None, rescale=False):
    """Test function without augmentation.

    Proposals come from the RPN head when the detector has one; otherwise
    the caller-provided ``proposals`` are used. ``rescale`` is accepted
    but not forwarded to the RoI head.
    """
    feats_dict, voxels_dict = self.extract_feat(points, img_metas)

    if not self.with_rpn:
        proposal_list = proposals
    else:
        rpn_outs = self.rpn_head(feats_dict['neck_feats'])
        bbox_inputs = rpn_outs + (img_metas, self.test_cfg.rpn)
        proposal_list = self.rpn_head.get_bboxes(*bbox_inputs)

    return self.roi_head.simple_test(feats_dict, voxels_dict, img_metas,
                                     proposal_list)
def init_weights(self, pretrained=None):
    """Initialize the backbone, the optional neck and the box head.

    ``pretrained`` is forwarded to the base class and to the backbone. A
    neck given as ``nn.Sequential`` is initialized module by module.
    """
    super(SingleStage3DDetector, self).init_weights(pretrained)
    self.backbone.init_weights(pretrained=pretrained)
    if self.with_neck:
        neck = self.neck
        if not isinstance(neck, nn.Sequential):
            neck.init_weights()
        else:
            for stage in neck:
                stage.init_weights()
    self.bbox_head.init_weights()
def extract_feats(self, points, img_metas):
    """Extract features of multiple samples.

    Calls ``extract_feat`` once per ``(points, img_meta)`` pair and returns
    the results as a list, in input order.
    """
    features = []
    for sample_points, sample_meta in zip(points, img_metas):
        features.append(self.extract_feat(sample_points, sample_meta))
    return features
def extract_voxel_heights(self, voxels, coors):
    """Compute per-cell minimum and maximum of channel 2 of the voxels.

    The grid is downsampled by ``out_size_factor`` along x and y, and for
    each resulting cell the smallest and largest value of ``voxels[..., 2]``
    over all voxels falling into that cell is returned.

    Args:
        voxels (torch.Tensor): Voxel tensor indexed as
            ``voxels[:, :, 2]``, i.e. (num_voxels, points_per_voxel,
            channels). Channel 2 is presumably the z coordinate --
            TODO confirm.
        coors (torch.Tensor): Voxel coordinates whose columns are used as
            (batch index, height index, y index, x index).

    Returns:
        tuple[torch.Tensor]: ``min_voxel_height`` and ``max_voxel_height``,
            each of shape (batch_size, y_num, x_num). Cells that receive
            no voxel keep the sentinel values 100 and -200 respectively.
    """
    # The batch size is read from the last row, so rows are assumed to be
    # ordered by batch index.
    batch_size = coors[-1, 0].item() + 1
    # NOTE(review): grid settings are always taken from ``test_cfg``, also
    # when this runs during training -- confirm test_cfg is always set.
    grid_size = self.test_cfg['pts']['grid_size']
    out_size_factor = self.test_cfg['pts']['out_size_factor']

    height_num = grid_size[2]
    x_num = grid_size[0] // out_size_factor
    y_num = grid_size[1] // out_size_factor

    voxels_ = voxels[:, :, 2].clone()
    # Entries equal to exactly 0 are replaced by +100 so they cannot win
    # the minimum (presumably these are zero-padded slots -- a real value
    # of 0 would be masked as well).
    voxels_[voxels_==0] = 100
    min_voxel = torch.min(voxels_, dim=-1)[0]
    # Flip the sentinel to -200 so the same entries cannot win the maximum.
    voxels_[voxels_==100] = -200
    max_voxel = torch.max(voxels_, dim=-1)[0]

    # One slot per fine voxel position inside each downsampled cell,
    # pre-filled with the sentinels.
    min_voxel_height = torch.zeros((batch_size, y_num, x_num, out_size_factor*out_size_factor)).to(voxels.device) + 100
    max_voxel_height = torch.zeros((batch_size, y_num, x_num, out_size_factor*out_size_factor)).to(voxels.device) - 200

    batch_ids = coors[:, 0].long()
    height_ids = coors[:, 1].long()
    # Index of the downsampled cell and the offset inside that cell.
    y_ids = (coors[:, 2] // out_size_factor).long()
    x_ids = (coors[:, 3] // out_size_factor).long()
    y_offsets = (coors[:, 2] % out_size_factor).long()
    x_offsets = (coors[:, 3] % out_size_factor).long()

    # Height bins are processed one at a time. Presumably this keeps every
    # (batch, y, x, offset) slot unique within one indexed assignment, so
    # writes do not overwrite each other -- TODO confirm.
    for hid in range(height_num):
        height_mask = height_ids == hid
        batch_mask = batch_ids[height_mask]
        y_ids_mask = y_ids[height_mask]
        x_ids_mask = x_ids[height_mask]
        y_offsets_mask = y_offsets[height_mask]
        x_offsets_mask = x_offsets[height_mask]

        min_voxel_height[batch_mask, y_ids_mask, x_ids_mask, y_offsets_mask * out_size_factor + x_offsets_mask] = torch.minimum(min_voxel_height[batch_mask, y_ids_mask, x_ids_mask, y_offsets_mask * out_size_factor + x_offsets_mask], min_voxel[height_mask])
        max_voxel_height[batch_mask, y_ids_mask, x_ids_mask, y_offsets_mask * out_size_factor + x_offsets_mask] = torch.maximum(max_voxel_height[batch_mask, y_ids_mask, x_ids_mask, y_offsets_mask * out_size_factor + x_offsets_mask], max_voxel[height_mask])

    # Reduce over the fine positions inside each downsampled cell.
    min_voxel_height = torch.min(min_voxel_height, dim=-1)[0]
    max_voxel_height = torch.max(max_voxel_height, dim=-1)[0]

    return min_voxel_height, max_voxel_height
def forward_train(self,
                  points=None,
                  img_metas=None,
                  gt_bboxes_3d=None,
                  gt_labels_3d=None,
                  gt_labels=None,
                  gt_bboxes=None,
                  gt_pts_centers_view=None,
                  gt_img_centers_view=None,
                  gt_bboxes_cam_view=None,
                  img=None,
                  sparse_depth=None,
                  gt_visible_3d=None,
                  gt_bboxes_lidar_view=None,
                  proposals=None,
                  gt_bboxes_ignore=None):
    """Forward training function.

    Features of both modalities are extracted once. The point branch
    losses are added when point features are available, and the image
    branch losses when image features are available.

    Args:
        points (list[torch.Tensor], optional): Points of each sample.
            Defaults to None.
        img_metas (list[dict], optional): Meta information of each sample.
            Defaults to None.
        gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional):
            Ground truth 3D boxes. Defaults to None.
        gt_labels_3d (list[torch.Tensor], optional): Ground truth labels
            of 3D boxes. Defaults to None.
        gt_labels (list[torch.Tensor], optional): Ground truth labels
            of 2D boxes in images. Defaults to None.
        gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in
            images. Defaults to None.
        gt_pts_centers_view, gt_img_centers_view, gt_bboxes_cam_view,
        sparse_depth, gt_visible_3d, gt_bboxes_lidar_view (optional):
            Extra inputs handed unchanged to ``forward_pts_train``.
            Default to None.
        img (torch.Tensor optional): Images of each sample with shape
            (N, C, H, W). Defaults to None.
        proposals ([list[torch.Tensor], optional): Predicted proposals
            used for training Fast RCNN. Defaults to None.
        gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
            2D boxes in images to be ignored. Defaults to None.

    Returns:
        dict: Losses of different branches.
    """
    img_feats, pts_feats = self.extract_feat(
        points, img=img, img_metas=img_metas)

    losses = {}
    if pts_feats:
        losses.update(
            self.forward_pts_train(
                pts_feats, img_feats, gt_bboxes_3d, gt_labels_3d,
                gt_bboxes, gt_labels, gt_pts_centers_view,
                gt_img_centers_view, gt_bboxes_cam_view, img_metas,
                gt_bboxes_ignore, sparse_depth, gt_visible_3d,
                gt_bboxes_lidar_view))
    if img_feats:
        losses.update(
            self.forward_img_train(
                img_feats,
                img_metas=img_metas,
                gt_bboxes=gt_bboxes,
                gt_labels=gt_labels,
                gt_bboxes_ignore=gt_bboxes_ignore,
                proposals=proposals))
    return losses
def forward_test(self, points, img_metas, img=None, sparse_depth=None, **kwargs):
    """Dispatch to ``simple_test`` or ``aug_test`` by augmentation count.

    Args:
        points (list[torch.Tensor]): the outer list indicates test-time
            augmentations and inner torch.Tensor should have a shape NxC,
            which contains all points in the batch.
        img_metas (list[list[dict]]): the outer list indicates test-time
            augs (multiscale, flip, etc.) and the inner list indicates
            images in a batch
        img (list[torch.Tensor], optional): the outer
            list indicates test-time augmentations and inner
            torch.Tensor should have a shape NxCxHxW, which contains
            all images in the batch. Defaults to None.
        sparse_depth (list, optional): One entry per test-time
            augmentation, forwarded to ``simple_test``. Defaults to None,
            in which case ``None`` is forwarded.

    Raises:
        TypeError: If ``points`` or ``img_metas`` is not a list.
        ValueError: If ``points`` and ``img_metas`` differ in length.
    """
    for var, name in [(points, 'points'), (img_metas, 'img_metas')]:
        if not isinstance(var, list):
            raise TypeError('{} must be a list, but got {}'.format(
                name, type(var)))

    num_augs = len(points)
    if num_augs != len(img_metas):
        raise ValueError(
            'num of augmentations ({}) != num of image meta ({})'.format(
                len(points), len(img_metas)))

    if num_augs == 1:
        # Wrap missing optional inputs so the [0] lookups below are valid.
        # ``sparse_depth`` used to be indexed without this guard, so calling
        # with its default of None raised a TypeError.
        img = [img] if img is None else img
        sparse_depth = [sparse_depth] if sparse_depth is None else sparse_depth
        return self.simple_test(points[0], img_metas[0], img[0], sparse_depth[0], **kwargs)
    else:
        # The augmented path does not take ``sparse_depth``.
        return self.aug_test(points, img_metas, img, **kwargs)
import builder
from .mvx_two_stage import MVXTwoStageDetector


@DETECTORS.register_module()
class TransFusionDetector(MVXTwoStageDetector):
    """TransFusion detector built on :class:`MVXTwoStageDetector`.

    Reads an optional ``freeze_img`` keyword (default True); when set, the
    parameters of the image backbone and image neck are frozen in
    :meth:`init_weights`.
    """

    def __init__(self, **kwargs):
        super(TransFusionDetector, self).__init__(**kwargs)

        self.freeze_img = kwargs.get('freeze_img', True)
        # Re-run weight init now that ``freeze_img`` is set, so the freezing
        # logic in init_weights sees it.
        self.init_weights(pretrained=kwargs.get('pretrained', None))

    def init_weights(self, pretrained=None):
        """Initialize model weights and optionally freeze the image branch."""
        super(TransFusionDetector, self).init_weights(pretrained)

        if self.freeze_img:
            if self.with_img_backbone:
                for param in self.img_backbone.parameters():
                    param.requires_grad = False
            if self.with_img_neck:
                for param in self.img_neck.parameters():
                    param.requires_grad = False

    def extract_img_feat(self, img, img_metas):
        """Extract features of images.

        Returns None when there is no image backbone or ``img`` is None.
        Note that a 5-D input with leading size 1 is squeezed in place.
        """
        if self.with_img_backbone and img is not None:
            input_shape = img.shape[-2:]
            # update real input shape of each single img
            for img_meta in img_metas:
                img_meta.update(input_shape=input_shape)

            if img.dim() == 5 and img.size(0) == 1:
                img.squeeze_(0)
            elif img.dim() == 5 and img.size(0) > 1:
                # fold the second axis into the batch axis
                B, N, C, H, W = img.size()
                img = img.view(B * N, C, H, W)
            img_feats = self.img_backbone(img.float())
        else:
            return None
        if self.with_img_neck:
            img_feats = self.img_neck(img_feats)
        return img_feats

    def extract_pts_feat(self, pts, img_feats, img_metas):
        """Extract features of points.

        ``img_feats`` and ``img_metas`` are accepted but not used here.
        Returns None when the detector has no point bbox head.
        """
        if not self.with_pts_bbox:
            return None
        voxels, num_points, coors = self.voxelize(pts)
        voxel_features = self.pts_voxel_encoder(voxels, num_points, coors,
                                                )
        # voxelize() prepends the sample index as column 0, so the last row
        # carries the highest batch index.
        batch_size = coors[-1, 0] + 1
        x = self.pts_middle_encoder(voxel_features, coors, batch_size)
        x = self.pts_backbone(x)
        if self.with_pts_neck:
            x = self.pts_neck(x)
        return x

    @torch.no_grad()
    @force_fp32()
    def voxelize(self, points):
        """Voxelize each sample with ``self.pts_voxel_layer`` and batch them.

        Args:
            points (list[torch.Tensor]): Points of each sample.

        Returns:
            tuple[torch.Tensor]: Concatenated voxels, number of points
                per voxel, and coordinates with the sample index prepended
                as the first column.
        """
        voxels, coors, num_points = [], [], []
        for res in points:
            res_voxels, res_coors, res_num_points = self.pts_voxel_layer(res)
            voxels.append(res_voxels)
            coors.append(res_coors)
            num_points.append(res_num_points)
        voxels = torch.cat(voxels, dim=0)
        num_points = torch.cat(num_points, dim=0)
        coors_batch = []
        for i, coor in enumerate(coors):
            # left-pad the last dim with the sample index i
            coor_pad = F.pad(coor, (1, 0), mode='constant', value=i)
            coors_batch.append(coor_pad)
        coors_batch = torch.cat(coors_batch, dim=0)
        return voxels, num_points, coors_batch

    def forward_train(self,
                      points=None,
                      img_metas=None,
                      gt_bboxes_3d=None,
                      gt_labels_3d=None,
                      gt_labels=None,
                      gt_bboxes=None,
                      img=None,
                      proposals=None,
                      gt_bboxes_ignore=None):
        """Forward training function.

        Args:
            points (list[torch.Tensor], optional): Points of each sample.
                Defaults to None.
            img_metas (list[dict], optional): Meta information of each sample.
                Defaults to None.
            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional):
                Ground truth 3D boxes. Defaults to None.
            gt_labels_3d (list[torch.Tensor], optional): Ground truth labels
                of 3D boxes. Defaults to None.
            gt_labels (list[torch.Tensor], optional): Ground truth labels
                of 2D boxes in images. Defaults to None.
            gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in
                images. Defaults to None.
            img (torch.Tensor optional): Images of each sample with shape
                (N, C, H, W). Defaults to None.
            proposals (list[torch.Tensor], optional): Predicted proposals
                used for training Fast RCNN. Defaults to None.
            gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
                2D boxes in images to be ignored. Defaults to None.

        Returns:
            dict: Losses of different branches.
        """
        img_feats, pts_feats = self.extract_feat(
            points, img=img, img_metas=img_metas)
        losses = dict()
        if pts_feats:
            losses_pts = self.forward_pts_train(pts_feats, img_feats, gt_bboxes_3d,
                                                gt_labels_3d, img_metas,
                                                gt_bboxes_ignore)
            losses.update(losses_pts)
        if img_feats:
            losses_img = self.forward_img_train(
                img_feats,
                img_metas=img_metas,
                gt_bboxes=gt_bboxes,
                gt_labels=gt_labels,
                gt_bboxes_ignore=gt_bboxes_ignore,
                proposals=proposals)
            losses.update(losses_img)
        return losses

    def forward_pts_train(self,
                          pts_feats,
                          img_feats,
                          gt_bboxes_3d,
                          gt_labels_3d,
                          img_metas,
                          gt_bboxes_ignore=None):
        """Forward function for point cloud branch.

        Args:
            pts_feats (list[torch.Tensor]): Features of point cloud branch
            img_feats: Image features, passed to the bbox head together with
                ``pts_feats``.
            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
                boxes for each sample.
            gt_labels_3d (list[torch.Tensor]): Ground truth labels for
                boxes of each sample
            img_metas (list[dict]): Meta information of samples.
            gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
                boxes to be ignored. Defaults to None. Not used in this
                method.

        Returns:
            dict: Losses of each branch.
        """
        outs = self.pts_bbox_head(pts_feats, img_feats, img_metas)
        loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs]
        losses = self.pts_bbox_head.loss(*loss_inputs)
        return losses

    def simple_test_pts(self, x, x_img, img_metas, rescale=False):
        """Test function of point cloud branch."""
        outs = self.pts_bbox_head(x, x_img, img_metas)
        bbox_list = self.pts_bbox_head.get_bboxes(
            outs, img_metas, rescale=rescale)
        bbox_results = [
            bbox3d2result(bboxes, scores, labels)
            for bboxes, scores, labels in bbox_list
        ]
        return bbox_results

    def simple_test(self, points, img_metas, img=None, rescale=False):
        """Test function without augmentation.

        Returns one dict per entry of ``img_metas``, filled with
        ``'pts_bbox'`` and/or ``'img_bbox'`` depending on which branches ran.
        """
        img_feats, pts_feats = self.extract_feat(
            points, img=img, img_metas=img_metas)

        bbox_list = [dict() for i in range(len(img_metas))]
        if pts_feats and self.with_pts_bbox:
            bbox_pts = self.simple_test_pts(
                pts_feats, img_feats, img_metas, rescale=rescale)
            for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
                result_dict['pts_bbox'] = pts_bbox
        if img_feats and self.with_img_bbox:
            bbox_img = self.simple_test_img(
                img_feats, img_metas, rescale=rescale)
            for result_dict, img_bbox in zip(bbox_list, bbox_img):
                result_dict['img_bbox'] = img_bbox
        return bbox_list


================================================
FILE: mmdet3d/models/detectors/two_stage.py
================================================
from mmdet.models import DETECTORS, TwoStageDetector
from .base import Base3DDetector


@DETECTORS.register_module()
class TwoStage3DDetector(Base3DDetector, TwoStageDetector):
    """Base class of two-stage 3D detector.

    It inherits original ``:class:TwoStageDetector`` and
    ``:class:Base3DDetector``. This class could serve as a base class for all
    two-stage 3D detectors.
    """

    def __init__(self, **kwargs):
        super(TwoStage3DDetector, self).__init__(**kwargs)


================================================
FILE: mmdet3d/models/detectors/votenet.py
================================================
import torch

from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d
from mmdet.models import DETECTORS
from .single_stage import SingleStage3DDetector


@DETECTORS.register_module()
class VoteNet(SingleStage3DDetector):
    r"""`VoteNet <https://arxiv.org/pdf/1904.09664.pdf>`_ for 3D detection."""

    def __init__(self,
                 backbone,
                 bbox_head=None,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None):
        super(VoteNet, self).__init__(
            backbone=backbone,
            bbox_head=bbox_head,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            pretrained=pretrained)

    def forward_train(self,
                      points,
                      img_metas,
                      gt_bboxes_3d,
                      gt_labels_3d,
                      pts_semantic_mask=None,
                      pts_instance_mask=None,
                      gt_bboxes_ignore=None):
        """Forward of training.

        Args:
            points (list[torch.Tensor]): Points of each batch.
            img_metas (list): Image metas.
            gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch.
            gt_labels_3d (list[torch.Tensor]): gt class labels of each batch.
            pts_semantic_mask (None | list[torch.Tensor]): point-wise semantic
                label of each batch.
            pts_instance_mask (None | list[torch.Tensor]): point-wise instance
                label of each batch.
            gt_bboxes_ignore (None | list[torch.Tensor]): Specify
                which bounding.

        Returns:
            dict: Losses.
        """
        # torch.stack requires every sample to hold the same number of points
        points_cat = torch.stack(points)

        x = self.extract_feat(points_cat)
        bbox_preds = self.bbox_head(x, self.train_cfg.sample_mod)
        loss_inputs = (points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask,
                       pts_instance_mask, img_metas)
        losses = self.bbox_head.loss(
            bbox_preds, *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        return losses

    def simple_test(self, points, img_metas, imgs=None, rescale=False):
        """Forward of testing.

        Args:
            points (list[torch.Tensor]): Points of each sample.
            img_metas (list): Image metas.
            imgs: Not used in this method. Defaults to None.
            rescale (bool): Whether to rescale results.

        Returns:
            list: Predicted 3d boxes.
        """
        points_cat = torch.stack(points)

        x = self.extract_feat(points_cat)
        bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod)
        bbox_list = self.bbox_head.get_bboxes(
            points_cat, bbox_preds, img_metas, rescale=rescale)
        bbox_results = [
            bbox3d2result(bboxes, scores, labels)
            for bboxes, scores, labels in bbox_list
        ]
        return bbox_results

    def aug_test(self, points, img_metas, imgs=None, rescale=False):
        """Test with augmentation.

        Only the first detection result of each augmentation is kept before
        merging, and a single merged result is returned in a list.
        """
        points_cat = [torch.stack(pts) for pts in points]
        feats = self.extract_feats(points_cat, img_metas)

        # only support aug_test for one sample
        aug_bboxes = []
        for x, pts_cat, img_meta in zip(feats, points_cat, img_metas):
            bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod)
            bbox_list = self.bbox_head.get_bboxes(
                pts_cat, bbox_preds, img_meta, rescale=rescale)
            bbox_list = [
                dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels)
                for bboxes, scores, labels in bbox_list
            ]
            aug_bboxes.append(bbox_list[0])

        # after merging, bboxes will be rescaled to the original image size
        merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas,
                                            self.bbox_head.test_cfg)

        return [merged_bboxes]


================================================
FILE: mmdet3d/models/detectors/voxelnet.py
================================================
import torch
from mmcv.runner import force_fp32
from torch.nn import functional as F

from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d
from mmdet3d.ops import Voxelization
from mmdet.models import DETECTORS
from ..
import builder from .single_stage import SingleStage3DDetector @DETECTORS.register_module() class VoxelNet(SingleStage3DDetector): r"""`VoxelNet `_ for 3D detection.""" def __init__(self, voxel_layer, voxel_encoder, middle_encoder, backbone, neck=None, bbox_head=None, train_cfg=None, test_cfg=None, pretrained=None): super(VoxelNet, self).__init__( backbone=backbone, neck=neck, bbox_head=bbox_head, train_cfg=train_cfg, test_cfg=test_cfg, pretrained=pretrained, ) self.voxel_layer = Voxelization(**voxel_layer) self.voxel_encoder = builder.build_voxel_encoder(voxel_encoder) self.middle_encoder = builder.build_middle_encoder(middle_encoder) def extract_feat(self, points, img_metas): """Extract features from points.""" voxels, num_points, coors = self.voxelize(points) voxel_features = self.voxel_encoder(voxels, num_points, coors) batch_size = coors[-1, 0].item() + 1 x = self.middle_encoder(voxel_features, coors, batch_size) x = self.backbone(x) if self.with_neck: x = self.neck(x) return x @torch.no_grad() @force_fp32() def voxelize(self, points): """Apply hard voxelization to points.""" voxels, coors, num_points = [], [], [] for res in points: res_voxels, res_coors, res_num_points = self.voxel_layer(res) voxels.append(res_voxels) coors.append(res_coors) num_points.append(res_num_points) voxels = torch.cat(voxels, dim=0) num_points = torch.cat(num_points, dim=0) coors_batch = [] for i, coor in enumerate(coors): coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) coors_batch.append(coor_pad) coors_batch = torch.cat(coors_batch, dim=0) return voxels, num_points, coors_batch def forward_train(self, points, img_metas, gt_bboxes_3d, gt_labels_3d, gt_bboxes_ignore=None): """Training forward function. Args: points (list[torch.Tensor]): Point cloud of each sample. img_metas (list[dict]): Meta information of each sample gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes for each sample. 
gt_labels_3d (list[torch.Tensor]): Ground truth labels for boxes of each sampole gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth boxes to be ignored. Defaults to None. Returns: dict: Losses of each branch. """ x = self.extract_feat(points, img_metas) outs = self.bbox_head(x) loss_inputs = outs + (gt_bboxes_3d, gt_labels_3d, img_metas) losses = self.bbox_head.loss( *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) return losses def simple_test(self, points, img_metas, imgs=None, rescale=False): """Test function without augmentaiton.""" x = self.extract_feat(points, img_metas) outs = self.bbox_head(x) bbox_list = self.bbox_head.get_bboxes( *outs, img_metas, rescale=rescale) bbox_results = [ bbox3d2result(bboxes, scores, labels) for bboxes, scores, labels in bbox_list ] return bbox_results def aug_test(self, points, img_metas, imgs=None, rescale=False): """Test function with augmentaiton.""" feats = self.extract_feats(points, img_metas) # only support aug_test for one sample aug_bboxes = [] for x, img_meta in zip(feats, img_metas): outs = self.bbox_head(x) bbox_list = self.bbox_head.get_bboxes( *outs, img_meta, rescale=rescale) bbox_list = [ dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) for bboxes, scores, labels in bbox_list ] aug_bboxes.append(bbox_list[0]) # after merging, bboxes will be rescaled to the original image size merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, self.bbox_head.test_cfg) return [merged_bboxes] ================================================ FILE: mmdet3d/models/fusion_layers/__init__.py ================================================ from .coord_transform import (apply_3d_transformation, bbox_2d_transform, coord_2d_transform) from .point_fusion import PointFusion from .vote_fusion import VoteFusion __all__ = [ 'PointFusion', 'VoteFusion', 'apply_3d_transformation', 'bbox_2d_transform', 'coord_2d_transform' ] ================================================ FILE: 
mmdet3d/models/fusion_layers/coord_transform.py ================================================ import torch from functools import partial from mmdet3d.core.points import get_points_type def apply_3d_transformation(pcd, coords_type, img_meta, reverse=False): """Apply transformation to input point cloud. Args: pcd (torch.Tensor): The point cloud to be transformed. coords_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR' img_meta(dict): Meta info regarding data transformation. reverse (bool): Reversed transformation or not. Note: The elements in img_meta['transformation_3d_flow']: "T" stands for translation; "S" stands for scale; "R" stands for rotation; "HF" stands for horizontal flip; "VF" stands for vertical flip. Returns: torch.Tensor: The transformed point cloud. """ dtype = pcd.dtype device = pcd.device pcd_rotate_mat = ( torch.tensor(img_meta['pcd_rotation'], dtype=dtype, device=device) if 'pcd_rotation' in img_meta else torch.eye( 3, dtype=dtype, device=device)) pcd_scale_factor = ( img_meta['pcd_scale_factor'] if 'pcd_scale_factor' in img_meta else 1.) 
pcd_trans_factor = ( torch.tensor(img_meta['pcd_trans'], dtype=dtype, device=device) if 'pcd_trans' in img_meta else torch.zeros( (3), dtype=dtype, device=device)) pcd_horizontal_flip = img_meta[ 'pcd_horizontal_flip'] if 'pcd_horizontal_flip' in \ img_meta else False pcd_vertical_flip = img_meta[ 'pcd_vertical_flip'] if 'pcd_vertical_flip' in \ img_meta else False flow = img_meta['transformation_3d_flow'] \ if 'transformation_3d_flow' in img_meta else [] pcd = pcd.clone() # prevent inplace modification pcd = get_points_type(coords_type)(pcd) horizontal_flip_func = partial(pcd.flip, bev_direction='horizontal') \ if pcd_horizontal_flip else lambda: None vertical_flip_func = partial(pcd.flip, bev_direction='vertical') \ if pcd_vertical_flip else lambda: None if reverse: scale_func = partial(pcd.scale, scale_factor=1.0 / pcd_scale_factor) translate_func = partial(pcd.translate, trans_vector=-pcd_trans_factor) # pcd_rotate_mat @ pcd_rotate_mat.inverse() is not # exactly an identity matrix # use angle to create the inverse rot matrix neither. rotate_func = partial(pcd.rotate, rotation=pcd_rotate_mat.inverse()) # reverse the pipeline flow = flow[::-1] else: scale_func = partial(pcd.scale, scale_factor=pcd_scale_factor) translate_func = partial(pcd.translate, trans_vector=pcd_trans_factor) rotate_func = partial(pcd.rotate, rotation=pcd_rotate_mat) flow_mapping = { 'T': translate_func, 'S': scale_func, 'R': rotate_func, 'HF': horizontal_flip_func, 'VF': vertical_flip_func } for op in flow: assert op in flow_mapping, f'This 3D data '\ f'transformation op ({op}) is not supported' func = flow_mapping[op] func() return pcd.coord def extract_2d_info(img_meta, tensor): """Extract image augmentation information from img_meta. Args: img_meta(dict): Meta info regarding data transformation. tensor(torch.Tensor): Input tensor used to create new ones. Returns: (int, int, int, int, torch.Tensor, bool, torch.Tensor): The extracted information. 
""" img_shape = img_meta['img_shape'] ori_shape = img_meta['ori_shape'] img_h, img_w, _ = img_shape ori_h, ori_w, _ = ori_shape img_scale_factor = ( tensor.new_tensor(img_meta['scale_factor'][:2]) if 'scale_factor' in img_meta else tensor.new_tensor([1.0, 1.0])) img_flip = img_meta['flip'] if 'flip' in img_meta else False img_crop_offset = ( tensor.new_tensor(img_meta['img_crop_offset']) if 'img_crop_offset' in img_meta else tensor.new_tensor([0.0, 0.0])) return (img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip, img_crop_offset) def bbox_2d_transform(img_meta, bbox_2d, ori2new): """Transform 2d bbox according to img_meta. Args: img_meta(dict): Meta info regarding data transformation. bbox_2d (torch.Tensor): Shape (..., >4) The input 2d bboxes to transform. ori2new (bool): Origin img coord system to new or not. Returns: torch.Tensor: The transformed 2d bboxes. """ img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip, \ img_crop_offset = extract_2d_info(img_meta, bbox_2d) bbox_2d_new = bbox_2d.clone() if ori2new: bbox_2d_new[:, 0] = bbox_2d_new[:, 0] * img_scale_factor[0] bbox_2d_new[:, 2] = bbox_2d_new[:, 2] * img_scale_factor[0] bbox_2d_new[:, 1] = bbox_2d_new[:, 1] * img_scale_factor[1] bbox_2d_new[:, 3] = bbox_2d_new[:, 3] * img_scale_factor[1] bbox_2d_new[:, 0] = bbox_2d_new[:, 0] + img_crop_offset[0] bbox_2d_new[:, 2] = bbox_2d_new[:, 2] + img_crop_offset[0] bbox_2d_new[:, 1] = bbox_2d_new[:, 1] + img_crop_offset[1] bbox_2d_new[:, 3] = bbox_2d_new[:, 3] + img_crop_offset[1] if img_flip: bbox_2d_r = img_w - bbox_2d_new[:, 0] bbox_2d_l = img_w - bbox_2d_new[:, 2] bbox_2d_new[:, 0] = bbox_2d_l bbox_2d_new[:, 2] = bbox_2d_r else: if img_flip: bbox_2d_r = img_w - bbox_2d_new[:, 0] bbox_2d_l = img_w - bbox_2d_new[:, 2] bbox_2d_new[:, 0] = bbox_2d_l bbox_2d_new[:, 2] = bbox_2d_r bbox_2d_new[:, 0] = bbox_2d_new[:, 0] - img_crop_offset[0] bbox_2d_new[:, 2] = bbox_2d_new[:, 2] - img_crop_offset[0] bbox_2d_new[:, 1] = bbox_2d_new[:, 1] - img_crop_offset[1] 
bbox_2d_new[:, 3] = bbox_2d_new[:, 3] - img_crop_offset[1] bbox_2d_new[:, 0] = bbox_2d_new[:, 0] / img_scale_factor[0] bbox_2d_new[:, 2] = bbox_2d_new[:, 2] / img_scale_factor[0] bbox_2d_new[:, 1] = bbox_2d_new[:, 1] / img_scale_factor[1] bbox_2d_new[:, 3] = bbox_2d_new[:, 3] / img_scale_factor[1] return bbox_2d_new def coord_2d_transform(img_meta, coord_2d, ori2new): """Transform 2d pixel coordinates according to img_meta. Args: img_meta(dict): Meta info regarding data transformation. coord_2d (torch.Tensor): Shape (..., 2) The input 2d coords to transform. ori2new (bool): Origin img coord system to new or not. Returns: torch.Tensor: The transformed 2d coordinates. """ img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip, \ img_crop_offset = extract_2d_info(img_meta, coord_2d) coord_2d_new = coord_2d.clone() if ori2new: # TODO here we assume this order of transformation coord_2d_new[..., 0] = coord_2d_new[..., 0] * img_scale_factor[0] coord_2d_new[..., 1] = coord_2d_new[..., 1] * img_scale_factor[1] coord_2d_new[..., 0] += img_crop_offset[0] coord_2d_new[..., 1] += img_crop_offset[1] # flip uv coordinates and bbox if img_flip: coord_2d_new[..., 0] = img_w - coord_2d_new[..., 0] else: if img_flip: coord_2d_new[..., 0] = img_w - coord_2d_new[..., 0] coord_2d_new[..., 0] -= img_crop_offset[0] coord_2d_new[..., 1] -= img_crop_offset[1] coord_2d_new[..., 0] = coord_2d_new[..., 0] / img_scale_factor[0] coord_2d_new[..., 1] = coord_2d_new[..., 1] / img_scale_factor[1] return coord_2d_new ================================================ FILE: mmdet3d/models/fusion_layers/point_fusion.py ================================================ import torch from mmcv.cnn import ConvModule, xavier_init from torch import nn as nn from torch.nn import functional as F from ..registry import FUSION_LAYERS from . 
import apply_3d_transformation def point_sample( img_meta, img_features, points, lidar2img_rt, img_scale_factor, img_crop_offset, img_flip, img_pad_shape, img_shape, aligned=True, padding_mode='zeros', align_corners=True, ): """Obtain image features using points. Args: img_meta (dict): Meta info. img_features (torch.Tensor): 1 x C x H x W image features. points (torch.Tensor): Nx3 point cloud in LiDAR coordinates. lidar2img_rt (torch.Tensor): 4x4 transformation matrix. img_scale_factor (torch.Tensor): Scale factor with shape of \ (w_scale, h_scale). img_crop_offset (torch.Tensor): Crop offset used to crop \ image during data augmentation with shape of (w_offset, h_offset). img_flip (bool): Whether the image is flipped. img_pad_shape (tuple[int]): int tuple indicates the h & w after padding, this is necessary to obtain features in feature map. img_shape (tuple[int]): int tuple indicates the h & w before padding after scaling, this is necessary for flipping coordinates. aligned (bool, optional): Whether use bilinear interpolation when sampling image features for each point. Defaults to True. padding_mode (str, optional): Padding mode when padding values for features of out-of-image points. Defaults to 'zeros'. align_corners (bool, optional): Whether to align corners when sampling image features for each point. Defaults to True. Returns: torch.Tensor: NxC image features sampled by point coordinates. 
""" # apply transformation based on info in img_meta points = apply_3d_transformation(points, 'LIDAR', img_meta, reverse=True) # project points from velo coordinate to camera coordinate num_points = points.shape[0] pts_4d = torch.cat([points, points.new_ones(size=(num_points, 1))], dim=-1) pts_2d = pts_4d @ lidar2img_rt.t() # cam_points is Tensor of Nx4 whose last column is 1 # transform camera coordinate to image coordinate pts_2d[:, 2] = torch.clamp(pts_2d[:, 2], min=1e-5) pts_2d[:, 0] /= pts_2d[:, 2] pts_2d[:, 1] /= pts_2d[:, 2] # img transformation: scale -> crop -> flip # the image is resized by img_scale_factor img_coors = pts_2d[:, 0:2] * img_scale_factor # Nx2 img_coors -= img_crop_offset # grid sample, the valid grid range should be in [-1,1] coor_x, coor_y = torch.split(img_coors, 1, dim=1) # each is Nx1 if img_flip: # by default we take it as horizontal flip # use img_shape before padding for flip orig_h, orig_w = img_shape coor_x = orig_w - coor_x h, w = img_pad_shape coor_y = coor_y / h * 2 - 1 coor_x = coor_x / w * 2 - 1 grid = torch.cat([coor_x, coor_y], dim=1).unsqueeze(0).unsqueeze(0) # Nx2 -> 1x1xNx2 # align_corner=True provides higher performance mode = 'bilinear' if aligned else 'nearest' point_features = F.grid_sample( img_features, grid, mode=mode, padding_mode=padding_mode, align_corners=align_corners) # 1xCx1xN feats return point_features.squeeze().t() @FUSION_LAYERS.register_module() class PointFusion(nn.Module): """Fuse image features from multi-scale features. Args: img_channels (list[int] | int): Channels of image features. It could be a list if the input is multi-scale image features. pts_channels (int): Channels of point features mid_channels (int): Channels of middle layers out_channels (int): Channels of output fused features img_levels (int, optional): Number of image levels. Defaults to 3. conv_cfg (dict, optional): Dict config of conv layers of middle layers. Defaults to None. 
norm_cfg (dict, optional): Dict config of norm layers of middle layers. Defaults to None. act_cfg (dict, optional): Dict config of activatation layers. Defaults to None. activate_out (bool, optional): Whether to apply relu activation to output features. Defaults to True. fuse_out (bool, optional): Whether apply conv layer to the fused features. Defaults to False. dropout_ratio (int, float, optional): Dropout ratio of image features to prevent overfitting. Defaults to 0. aligned (bool, optional): Whether apply aligned feature fusion. Defaults to True. align_corners (bool, optional): Whether to align corner when sampling features according to points. Defaults to True. padding_mode (str, optional): Mode used to pad the features of points that do not have corresponding image features. Defaults to 'zeros'. lateral_conv (bool, optional): Whether to apply lateral convs to image features. Defaults to True. """ def __init__(self, img_channels, pts_channels, mid_channels, out_channels, img_levels=3, conv_cfg=None, norm_cfg=None, act_cfg=None, activate_out=True, fuse_out=False, dropout_ratio=0, aligned=True, align_corners=True, padding_mode='zeros', lateral_conv=True): super(PointFusion, self).__init__() if isinstance(img_levels, int): img_levels = [img_levels] if isinstance(img_channels, int): img_channels = [img_channels] * len(img_levels) assert isinstance(img_levels, list) assert isinstance(img_channels, list) assert len(img_channels) == len(img_levels) self.img_levels = img_levels self.act_cfg = act_cfg self.activate_out = activate_out self.fuse_out = fuse_out self.dropout_ratio = dropout_ratio self.img_channels = img_channels self.aligned = aligned self.align_corners = align_corners self.padding_mode = padding_mode self.lateral_convs = None if lateral_conv: self.lateral_convs = nn.ModuleList() for i in range(len(img_channels)): l_conv = ConvModule( img_channels[i], mid_channels, 3, padding=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=self.act_cfg, inplace=False) 
self.lateral_convs.append(l_conv) self.img_transform = nn.Sequential( nn.Linear(mid_channels * len(img_channels), out_channels), nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), ) else: self.img_transform = nn.Sequential( nn.Linear(sum(img_channels), out_channels), nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), ) self.pts_transform = nn.Sequential( nn.Linear(pts_channels, out_channels), nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), ) if self.fuse_out: self.fuse_conv = nn.Sequential( nn.Linear(mid_channels, out_channels), # For pts the BN is initialized differently by default # TODO: check whether this is necessary nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), nn.ReLU(inplace=False)) self.init_weights() # default init_weights for conv(msra) and norm in ConvModule def init_weights(self): """Initialize the weights of modules.""" for m in self.modules(): if isinstance(m, (nn.Conv2d, nn.Linear)): xavier_init(m, distribution='uniform') def forward(self, img_feats, pts, pts_feats, img_metas): """Forward function. Args: img_feats (list[torch.Tensor]): Image features. pts: [list[torch.Tensor]]: A batch of points with shape N x 3. pts_feats (torch.Tensor): A tensor consist of point features of the total batch. img_metas (list[dict]): Meta information of images. Returns: torch.Tensor: Fused features of each point. """ img_pts = self.obtain_mlvl_feats(img_feats, pts, img_metas) img_pre_fuse = self.img_transform(img_pts) if self.training and self.dropout_ratio > 0: img_pre_fuse = F.dropout(img_pre_fuse, self.dropout_ratio) pts_pre_fuse = self.pts_transform(pts_feats) fuse_out = img_pre_fuse + pts_pre_fuse if self.activate_out: fuse_out = F.relu(fuse_out) if self.fuse_out: fuse_out = self.fuse_conv(fuse_out) return fuse_out def obtain_mlvl_feats(self, img_feats, pts, img_metas): """Obtain multi-level features for each point. Args: img_feats (list(torch.Tensor)): Multi-scale image features produced by image backbone in shape (N, C, H, W). 
pts (list[torch.Tensor]): Points of each sample. img_metas (list[dict]): Meta information for each sample. Returns: torch.Tensor: Corresponding image features of each point. """ if self.lateral_convs is not None: img_ins = [ lateral_conv(img_feats[i]) for i, lateral_conv in zip(self.img_levels, self.lateral_convs) ] else: img_ins = img_feats img_feats_per_point = [] # Sample multi-level features for i in range(len(img_metas)): mlvl_img_feats = [] for level in range(len(self.img_levels)): mlvl_img_feats.append( self.sample_single(img_ins[level][i:i + 1], pts[i][:, :3], img_metas[i])) mlvl_img_feats = torch.cat(mlvl_img_feats, dim=-1) img_feats_per_point.append(mlvl_img_feats) img_pts = torch.cat(img_feats_per_point, dim=0) return img_pts def sample_single(self, img_feats, pts, img_meta): """Sample features from single level image feature map. Args: img_feats (torch.Tensor): Image feature map in shape (1, C, H, W). pts (torch.Tensor): Points of a single sample. img_meta (dict): Meta information of the single sample. Returns: torch.Tensor: Single level image features of each point. 
@FUSION_LAYERS.register_module()
class VoteFusion(nn.Module):
    """Fuse 2d features from 3d seeds.

    For every 3D seed the module projects the seed into the image, pairs it
    with the 2D boxes that contain the projection and builds, per pair,
    5 geometric cues, ``num_classes`` semantic cues and 3 texture cues.

    Args:
        num_classes (int): number of classes.
        max_imvote_per_pixel (int): max number of imvotes.
    """

    def __init__(self, num_classes=10, max_imvote_per_pixel=3):
        super(VoteFusion, self).__init__()
        self.num_classes = num_classes
        self.max_imvote_per_pixel = max_imvote_per_pixel

    def forward(self, imgs, bboxes_2d_rescaled, seeds_3d_depth, img_metas,
                calibs):
        """Forward function.

        Args:
            imgs (list[torch.Tensor]): Per-sample images. Each one is cropped
                to ``img_meta['img_shape']`` and flattened to 3 channels.
            bboxes_2d_rescaled (list[torch.Tensor]): Per-sample 2D bboxes
                whose last dimension holds
                (left, top, right, bottom, confidence, class).
            seeds_3d_depth (torch.Tensor): Per-sample 3D seeds with 3
                coordinates each.
            img_metas (list[dict]): Meta information of images.
            calibs: Camera calibration information of the images. It is
                indexed as ``calibs['Rt'][i]`` and ``calibs['K'][i]`` for
                the i-th sample.

        Returns:
            torch.Tensor: Concatenated cues of each point, stacked over the
                batch. Each sample contributes a tensor with
                ``5 + num_classes + 3`` rows and
                ``seed_num * max_imvote_per_pixel`` columns.
            torch.Tensor: Validity mask of each feature, stacked over the
                batch.
        """
        img_features = []
        masks = []
        # 5 geometric cues + one semantic cue per class. This must match the
        # feature size produced by the non-empty branch below, otherwise the
        # final torch.stack fails for batches mixing both cases.
        cue_dim = 5 + self.num_classes
        for i, data in enumerate(
                zip(imgs, bboxes_2d_rescaled, seeds_3d_depth, img_metas)):
            img, bbox_2d_rescaled, seed_3d_depth, img_meta = data
            bbox_num = bbox_2d_rescaled.shape[0]
            seed_num = seed_3d_depth.shape[0]

            img_shape = img_meta['img_shape']
            img_h, img_w, _ = img_shape

            # first reverse the data transformations
            xyz_depth = apply_3d_transformation(
                seed_3d_depth, 'DEPTH', img_meta, reverse=True)

            # then convert from depth coords to camera coords
            xyz_cam = Coord3DMode.convert_point(
                xyz_depth,
                Coord3DMode.DEPTH,
                Coord3DMode.CAM,
                rt_mat=calibs['Rt'][i])

            # project to 2d to get image coords (uv)
            uv_origin = points_cam2img(xyz_cam, calibs['K'][i])
            uv_origin = (uv_origin - 1).round()

            # rescale 2d coordinates and bboxes
            uv_rescaled = coord_2d_transform(img_meta, uv_origin, True)
            bbox_2d_origin = bbox_2d_transform(img_meta, bbox_2d_rescaled,
                                               False)

            if bbox_num == 0:
                imvote_num = seed_num * self.max_imvote_per_pixel

                # use zero features
                two_cues = torch.zeros((cue_dim, imvote_num),
                                       device=seed_3d_depth.device)

                mask_zero = torch.zeros(
                    imvote_num - seed_num, device=seed_3d_depth.device).bool()
                mask_one = torch.ones(
                    seed_num, device=seed_3d_depth.device).bool()
                mask = torch.cat([mask_one, mask_zero], dim=0)
            else:
                # expand bboxes and seeds
                bbox_expanded = bbox_2d_origin.view(1, bbox_num, -1).expand(
                    seed_num, -1, -1)
                seed_2d_expanded = uv_origin.view(seed_num, 1,
                                                  -1).expand(-1, bbox_num, -1)
                seed_2d_expanded_x, seed_2d_expanded_y = \
                    seed_2d_expanded.split(1, dim=-1)
                bbox_expanded_l, bbox_expanded_t, bbox_expanded_r, \
                    bbox_expanded_b, bbox_expanded_conf, bbox_expanded_cls = \
                    bbox_expanded.split(1, dim=-1)
                bbox_expanded_midx = (bbox_expanded_l + bbox_expanded_r) / 2
                bbox_expanded_midy = (bbox_expanded_t + bbox_expanded_b) / 2

                # a seed is paired with a bbox only if its projection lies
                # strictly inside the bbox
                seed_2d_in_bbox_x = (seed_2d_expanded_x > bbox_expanded_l) * \
                    (seed_2d_expanded_x < bbox_expanded_r)
                seed_2d_in_bbox_y = (seed_2d_expanded_y > bbox_expanded_t) * \
                    (seed_2d_expanded_y < bbox_expanded_b)
                seed_2d_in_bbox = seed_2d_in_bbox_x * seed_2d_in_bbox_y

                # semantic cues, dim=class_num
                sem_cue = torch.zeros_like(bbox_expanded_conf).expand(
                    -1, -1, self.num_classes)
                sem_cue = sem_cue.scatter(-1, bbox_expanded_cls.long(),
                                          bbox_expanded_conf)

                # bbox center - uv
                delta_u = bbox_expanded_midx - seed_2d_expanded_x
                delta_v = bbox_expanded_midy - seed_2d_expanded_y

                seed_3d_expanded = seed_3d_depth.view(seed_num, 1, -1).expand(
                    -1, bbox_num, -1)

                z_cam = xyz_cam[..., 2:3].view(seed_num, 1,
                                               1).expand(-1, bbox_num, -1)

                delta_u = delta_u * z_cam / calibs['K'][i, 0, 0]
                delta_v = delta_v * z_cam / calibs['K'][i, 0, 0]

                imvote = torch.cat(
                    [delta_u, delta_v,
                     torch.zeros_like(delta_v)], dim=-1).view(-1, 3)

                # convert from camera coords to depth coords
                imvote = Coord3DMode.convert_point(
                    imvote.view((-1, 3)),
                    Coord3DMode.CAM,
                    Coord3DMode.DEPTH,
                    rt_mat=calibs['Rt'][i])

                # apply transformation to lifted imvotes
                imvote = apply_3d_transformation(
                    imvote, 'DEPTH', img_meta, reverse=False)

                seed_3d_expanded = seed_3d_expanded.reshape(imvote.shape)

                # ray angle
                ray_angle = seed_3d_expanded + imvote
                ray_angle /= torch.sqrt(torch.sum(ray_angle**2, -1) +
                                        EPS).unsqueeze(-1)

                # imvote lifted to 3d
                xz = ray_angle[:, [0, 2]] / (ray_angle[:, [1]] + EPS) \
                    * seed_3d_expanded[:, [1]] - seed_3d_expanded[:, [0, 2]]

                # geometric cues, dim=5
                geo_cue = torch.cat([xz, ray_angle],
                                    dim=-1).view(seed_num, -1, 5)

                two_cues = torch.cat([geo_cue, sem_cue], dim=-1)
                # mask to 0 if seed not in bbox
                two_cues = two_cues * seed_2d_in_bbox.float()

                feature_size = two_cues.shape[-1]
                # if bbox number is too small, append zeros
                if bbox_num < self.max_imvote_per_pixel:
                    append_num = self.max_imvote_per_pixel - bbox_num
                    append_zeros = torch.zeros(
                        (seed_num, append_num, 1),
                        device=seed_2d_in_bbox.device).bool()
                    seed_2d_in_bbox = torch.cat(
                        [seed_2d_in_bbox, append_zeros], dim=1)
                    append_zeros = torch.zeros(
                        (seed_num, append_num, feature_size),
                        device=two_cues.device)
                    two_cues = torch.cat([two_cues, append_zeros], dim=1)
                    append_zeros = torch.zeros((seed_num, append_num, 1),
                                               device=two_cues.device)
                    bbox_expanded_conf = torch.cat(
                        [bbox_expanded_conf, append_zeros], dim=1)

                # sort the valid seed-bbox pair according to confidence
                pair_score = seed_2d_in_bbox.float() + bbox_expanded_conf
                # and find the largests
                mask, indices = pair_score.topk(
                    self.max_imvote_per_pixel,
                    dim=1,
                    largest=True,
                    sorted=True)

                indices_img = indices.expand(-1, -1, feature_size)
                two_cues = two_cues.gather(dim=1, index=indices_img)
                two_cues = two_cues.transpose(1, 0)
                two_cues = two_cues.reshape(-1, feature_size).transpose(
                    1, 0).contiguous()

                # since conf is ~ (0, 1), floor gives us validity
                mask = mask.floor().int()
                mask = mask.transpose(1, 0).reshape(-1).bool()

            # clear the padding
            img = img[:, :img_shape[0], :img_shape[1]]
            img_flatten = img.reshape(3, -1).float()
            img_flatten /= 255.

            # take the normalized pixel value as texture cue
            uv_flatten = uv_rescaled[:, 1].round() * \
                img_shape[1] + uv_rescaled[:, 0].round()
            uv_expanded = uv_flatten.unsqueeze(0).expand(3, -1).long()
            txt_cue = torch.gather(img_flatten, dim=-1, index=uv_expanded)
            txt_cue = txt_cue.unsqueeze(1).expand(-1,
                                                  self.max_imvote_per_pixel,
                                                  -1).reshape(3, -1)

            # append texture cue
            img_feature = torch.cat([two_cues, txt_cue], dim=0)
            img_features.append(img_feature)
            masks.append(mask)

        return torch.stack(img_features, 0), torch.stack(masks, 0)
@weighted_loss
def axis_aligned_iou_loss(pred, target):
    """Compute the IoU loss (1 - IoU) between paired axis aligned boxes.

    The i-th prediction is compared with the i-th target only
    (``is_aligned=True``).

    Args:
        pred (torch.Tensor): Bbox predictions.
        target (torch.Tensor): Bbox targets (gt), paired one-to-one with
            ``pred``.
            NOTE(review): the box layout is whatever
            ``AxisAlignedBboxOverlaps3D`` expects - confirm there.

    Returns:
        torch.Tensor: IoU loss between predictions and targets.
    """
    iou_calculator = AxisAlignedBboxOverlaps3D()
    return 1 - iou_calculator(pred, target, is_aligned=True)
def chamfer_distance(src,
                     dst,
                     src_weight=1.0,
                     dst_weight=1.0,
                     criterion_mode='l2',
                     reduction='mean'):
    """Calculate Chamfer Distance of two sets.

    Args:
        src (torch.Tensor): Source set with shape [B, N, C] to
            calculate Chamfer Distance.
        dst (torch.Tensor): Destination set with shape [B, M, C] to
            calculate Chamfer Distance.
        src_weight (torch.Tensor or float): Weight of source loss. It is
            multiplied with the [B, N] source distances.
        dst_weight (torch.Tensor or float): Weight of destination loss. It
            is multiplied with the [B, M] destination distances.
        criterion_mode (str): Criterion mode to calculate distance.
            The valid modes are smooth_l1, l1 or l2.
        reduction (str): Method to reduce losses.
            The valid reduction method are 'none', 'sum' or 'mean'.

    Returns:
        tuple: Source and Destination loss with the corresponding indices.

            - loss_src (torch.Tensor): The min distance \
                from source to destination.
            - loss_dst (torch.Tensor): The min distance \
                from destination to source.
            - indices1 (torch.Tensor): Index the min distance point \
                for each point in source to destination.
            - indices2 (torch.Tensor): Index the min distance point \
                for each point in destination to source.

    Raises:
        NotImplementedError: If ``criterion_mode`` or ``reduction`` is not
            one of the supported values.
    """
    if criterion_mode == 'smooth_l1':
        criterion = smooth_l1_loss
    elif criterion_mode == 'l1':
        criterion = l1_loss
    elif criterion_mode == 'l2':
        criterion = mse_loss
    else:
        raise NotImplementedError(
            f'Unsupported criterion_mode: {criterion_mode!r}')

    # Broadcast both sets to [B, N, M, C] as views. ``expand`` does not copy
    # memory, unlike ``repeat``, and both operands end up with the same
    # shape so the criterion sees matching sizes.
    src_expand = src.unsqueeze(2).expand(-1, -1, dst.shape[1], -1)
    dst_expand = dst.unsqueeze(1).expand(-1, src.shape[1], -1, -1)

    # Pairwise distance, summed over the channel dimension -> [B, N, M].
    distance = criterion(src_expand, dst_expand, reduction='none').sum(-1)
    src2dst_distance, indices1 = torch.min(distance, dim=2)  # (B,N)
    dst2src_distance, indices2 = torch.min(distance, dim=1)  # (B,M)

    loss_src = (src2dst_distance * src_weight)
    loss_dst = (dst2src_distance * dst_weight)

    if reduction == 'sum':
        loss_src = torch.sum(loss_src)
        loss_dst = torch.sum(loss_dst)
    elif reduction == 'mean':
        loss_src = torch.mean(loss_src)
        loss_dst = torch.mean(loss_dst)
    elif reduction == 'none':
        pass
    else:
        raise NotImplementedError(f'Unsupported reduction: {reduction!r}')

    return loss_src, loss_dst, indices1, indices2
@LOSSES.register_module()
class LaplaceL1Loss(nn.Module):
    """Laplacian aleatoric uncertainty L1 loss.

    Thin module wrapper around ``laplacian_aleatoric_uncertainty_loss``.
    That function reads the first channel of the last dimension of ``pred``
    as the predicted value and the remaining channel(s) as its log
    variance, so this is not a plain L1 loss.

    Args:
        reduction (str, optional): The method to reduce the loss.
            Options are "none", "mean" and "sum".
        loss_weight (float, optional): The weight of loss.
    """

    def __init__(self, reduction='mean', loss_weight=1.0):
        super(LaplaceL1Loss, self).__init__()
        # Fail fast on a bad config, like the other loss modules do.
        assert reduction in ['none', 'sum', 'mean']
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self,
                pred,
                target,
                weight=None,
                avg_factor=None,
                reduction_override=None):
        """Forward function.

        Args:
            pred (torch.Tensor): The prediction. Its last dimension holds the
                predicted value followed by the log variance.
            target (torch.Tensor): The learning target of the prediction.
            weight (torch.Tensor, optional): The weight of loss for each
                prediction. Defaults to None.
            avg_factor (int, optional): Average factor that is used to average
                the loss. Defaults to None.
            reduction_override (str, optional): The reduction method used to
                override the original reduction method of the loss.
                Defaults to None.

        Returns:
            torch.Tensor: The loss scaled by ``loss_weight``.
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        loss = laplacian_aleatoric_uncertainty_loss(
            pred,
            target,
            weight=weight,
            reduction=reduction,
            avg_factor=avg_factor)
        loss_bbox = self.loss_weight * loss
        return loss_bbox
def forward_single(self, voxel_features, coors):
    """Scatter features of single sample.

    Args:
        voxel_features (torch.Tensor): Voxel features in shape (N, C),
            with ``C == self.in_channels``.
        coors (torch.Tensor): Coordinates of each voxel in shape (N, K).
            The last two columns are read as the row (y) and column (x)
            index on the canvas; any leading columns (e.g. sample ID
            and/or z index) are ignored.

    Returns:
        list[torch.Tensor]: A single-element list holding the canvas in
            shape (1, in_channels, ny, nx).
    """
    # Create the canvas for this sample
    canvas = torch.zeros(
        self.in_channels,
        self.nx * self.ny,
        dtype=voxel_features.dtype,
        device=voxel_features.device)

    # Take (y, x) from the end of each row. Fixed columns 1 and 2 would
    # pick (z, y) for the 4-column (sample ID, z, y, x) layout that
    # ``forward_batch`` consumes, scattering pillars to wrong cells.
    # 3-column input keeps resolving to the same columns as before.
    indices = coors[:, -2] * self.nx + coors[:, -1]
    indices = indices.long()
    voxels = voxel_features.t()
    # Now scatter the blob back to the canvas.
    canvas[:, indices] = voxels
    # Undo the column stacking to final 4-dim tensor
    canvas = canvas.view(1, self.in_channels, self.ny, self.nx)
    return [canvas]
@MIDDLE_ENCODERS.register_module()
class SparseEncoder(nn.Module):
    r"""Sparse encoder for SECOND and Part-A2.

    Args:
        in_channels (int): The number of input channels.
        sparse_shape (list[int]): The sparse shape of input tensor.
        order (list[str]): Order of conv module. Must be a 3-tuple made of
            'conv', 'norm' and 'act'. Defaults to ('conv', 'norm', 'act').
        norm_cfg (dict): Config of normalization layer. Defaults to
            dict(type='BN1d', eps=1e-3, momentum=0.01).
        base_channels (int): Out channels for conv_input layer.
            Defaults to 16.
        output_channels (int): Out channels for conv_out layer.
            Defaults to 128.
        encoder_channels (tuple[tuple[int]]):
            Convolutional channels of each encode block. Defaults to
            ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)).
        encoder_paddings (tuple[tuple[int]]): Paddings of each encode block.
            Defaults to ((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 1)).
        block_type (str): Type of the block to use, either 'conv_module' or
            'basicblock'. Defaults to 'conv_module'.
    """

    def __init__(self,
                 in_channels,
                 sparse_shape,
                 order=('conv', 'norm', 'act'),
                 norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
                 base_channels=16,
                 output_channels=128,
                 encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64,
                                                                        64)),
                 encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1,
                                                                 1)),
                 block_type='conv_module'):
        super().__init__()
        assert block_type in ['conv_module', 'basicblock']

        # Keep the raw configuration around.
        self.in_channels = in_channels
        self.sparse_shape = sparse_shape
        self.order = order
        self.base_channels = base_channels
        self.output_channels = output_channels
        self.encoder_channels = encoder_channels
        self.encoder_paddings = encoder_paddings
        self.stage_num = len(self.encoder_channels)
        self.fp16_enabled = False
        # Spconv init all weight on its own

        assert isinstance(order, tuple) and len(order) == 3
        assert set(order) == {'conv', 'norm', 'act'}

        # Input layer: a 3x3x3 submanifold conv. In the pre-activate case
        # (order does not start with 'conv') it is built with
        # order=('conv', ); otherwise the helper's default order is used.
        input_conv_kwargs = dict(
            norm_cfg=norm_cfg,
            padding=1,
            indice_key='subm1',
            conv_type='SubMConv3d')
        if self.order[0] != 'conv':
            input_conv_kwargs['order'] = ('conv', )
        self.conv_input = make_sparse_convmodule(in_channels,
                                                 self.base_channels, 3,
                                                 **input_conv_kwargs)

        encoder_out_channels = self.make_encoder_layers(
            make_sparse_convmodule,
            norm_cfg,
            self.base_channels,
            block_type=block_type)

        # Output layer: strided sparse conv along the first spatial axis.
        self.conv_out = make_sparse_convmodule(
            encoder_out_channels,
            self.output_channels,
            kernel_size=(3, 1, 1),
            stride=(2, 1, 1),
            norm_cfg=norm_cfg,
            padding=0,
            indice_key='spconv_down2',
            conv_type='SparseConv3d')

    @auto_fp16(apply_to=('voxel_features', ))
    def forward(self, voxel_features, coors, batch_size):
        """Forward of SparseEncoder.

        Args:
            voxel_features (torch.float32): Voxel features in shape (N, C).
            coors (torch.int32): Coordinates in shape (N, 4), \
                the columns in the order of (batch_idx, z_idx, y_idx, x_idx).
            batch_size (int): Batch size.

        Returns:
            torch.Tensor: Dense backbone features in shape (N, C * D, H, W),
                i.e. the depth axis of the dense output folded into the
                channel axis.
        """
        sparse_input = spconv.SparseConvTensor(voxel_features, coors.int(),
                                               self.sparse_shape, batch_size)
        x = self.conv_input(sparse_input)

        # Run the stages in sequence; only the last stage output is needed.
        for stage in self.encoder_layers:
            x = stage(x)

        # for detection head
        # [200, 176, 5] -> [200, 176, 2]
        dense = self.conv_out(x).dense()

        N, C, D, H, W = dense.shape
        return dense.view(N, C * D, H, W)

    def make_encoder_layers(self,
                            make_block,
                            norm_cfg,
                            in_channels,
                            block_type='conv_module',
                            conv_cfg=dict(type='SubMConv3d')):
        """make encoder layers using sparse convs.

        Populates ``self.encoder_layers`` with one ``SparseSequential`` per
        entry of ``self.encoder_channels``.

        Args:
            make_block (method): A bounded function to build blocks.
            norm_cfg (dict[str]): Config of normalization layer.
            in_channels (int): The number of encoder input channels.
            block_type (str): Type of the block to use. Defaults to
                'conv_module'.
            conv_cfg (dict): Config of conv layer. Defaults to
                dict(type='SubMConv3d').

        Returns:
            int: The number of encoder output channels.
        """
        assert block_type in ['conv_module', 'basicblock']
        self.encoder_layers = spconv.SparseSequential()
        last_stage = len(self.encoder_channels) - 1

        for i, blocks in enumerate(self.encoder_channels):
            stage_blocks = []
            for j, out_channels in enumerate(tuple(blocks)):
                padding = tuple(self.encoder_paddings[i])[j]

                # Where the stride-2 sparse conv goes depends on the block
                # type: 'conv_module' opens every stage but the first with
                # it, 'basicblock' closes every stage but the last with it.
                if block_type == 'conv_module':
                    downsample = i != 0 and j == 0
                elif block_type == 'basicblock':
                    downsample = j == len(blocks) - 1 and i != last_stage
                else:
                    downsample = False

                if downsample:
                    layer = make_block(
                        in_channels,
                        out_channels,
                        3,
                        norm_cfg=norm_cfg,
                        stride=2,
                        padding=padding,
                        indice_key=f'spconv{i + 1}',
                        conv_type='SparseConv3d')
                elif block_type == 'basicblock':
                    layer = SparseBasicBlock(
                        out_channels,
                        out_channels,
                        norm_cfg=norm_cfg,
                        conv_cfg=conv_cfg)
                else:
                    layer = make_block(
                        in_channels,
                        out_channels,
                        3,
                        norm_cfg=norm_cfg,
                        padding=padding,
                        indice_key=f'subm{i + 1}',
                        conv_type='SubMConv3d')

                stage_blocks.append(layer)
                in_channels = out_channels

            self.encoder_layers.add_module(
                f'encoder_layer{i + 1}',
                spconv.SparseSequential(*stage_blocks))
        return out_channels
output_channels (int): Out channels for conv_out layer. encoder_channels (tuple[tuple[int]]): Convolutional channels of each encode block. encoder_paddings (tuple[tuple[int]]): Paddings of each encode block. decoder_channels (tuple[tuple[int]]): Convolutional channels of each decode block. decoder_paddings (tuple[tuple[int]]): Paddings of each decode block. """ def __init__(self, in_channels, sparse_shape, order=('conv', 'norm', 'act'), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), base_channels=16, output_channels=128, encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)), encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 1)), decoder_channels=((64, 64, 64), (64, 64, 32), (32, 32, 16), (16, 16, 16)), decoder_paddings=((1, 0), (1, 0), (0, 0), (0, 1))): super().__init__() self.sparse_shape = sparse_shape self.in_channels = in_channels self.order = order self.base_channels = base_channels self.output_channels = output_channels self.encoder_channels = encoder_channels self.encoder_paddings = encoder_paddings self.decoder_channels = decoder_channels self.decoder_paddings = decoder_paddings self.stage_num = len(self.encoder_channels) self.fp16_enabled = False # Spconv init all weight on its own assert isinstance(order, tuple) and len(order) == 3 assert set(order) == {'conv', 'norm', 'act'} if self.order[0] != 'conv': # pre activate self.conv_input = make_sparse_convmodule( in_channels, self.base_channels, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm1', conv_type='SubMConv3d', order=('conv', )) else: # post activate self.conv_input = make_sparse_convmodule( in_channels, self.base_channels, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm1', conv_type='SubMConv3d') encoder_out_channels = self.make_encoder_layers( make_sparse_convmodule, norm_cfg, self.base_channels) self.make_decoder_layers(make_sparse_convmodule, norm_cfg, encoder_out_channels) self.conv_out = make_sparse_convmodule( encoder_out_channels, self.output_channels, 
@auto_fp16(apply_to=('voxel_features', ))
def forward(self, voxel_features, coors, batch_size):
    """Forward of SparseUNet.

    Args:
        voxel_features (torch.float32): Voxel features in shape [N, C].
        coors (torch.int32): Coordinates in shape [N, 4],
            the columns in the order of (batch_idx, z_idx, y_idx, x_idx).
        batch_size (int): Batch size.

    Returns:
        dict[str, torch.Tensor]: Backbone features. ``spatial_features`` is
            the dense encoder output with its depth axis folded into the
            channel axis; ``seg_features`` holds the features of the last
            decoder output.
    """
    sparse_input = spconv.SparseConvTensor(voxel_features, coors.int(),
                                           self.sparse_shape, batch_size)
    x = self.conv_input(sparse_input)

    # Encoder pass; every stage output is kept as a lateral input for the
    # decoder below.
    encode_features = []
    for stage in self.encoder_layers:
        x = stage(x)
        encode_features.append(x)

    # for detection head
    # [200, 176, 5] -> [200, 176, 2]
    dense = self.conv_out(encode_features[-1]).dense()
    N, C, D, H, W = dense.shape
    spatial_features = dense.view(N, C * D, H, W)

    # for segmentation head, with output shape:
    # [400, 352, 11] <- [200, 176, 5]
    # [800, 704, 21] <- [400, 352, 11]
    # [1600, 1408, 41] <- [800, 704, 21]
    # [1600, 1408, 41] <- [1600, 1408, 41]
    x = encode_features[-1]
    for stage in reversed(range(1, self.stage_num + 1)):
        lateral_layer = getattr(self, f'lateral_layer{stage}')
        merge_layer = getattr(self, f'merge_layer{stage}')
        upsample_layer = getattr(self, f'upsample_layer{stage}')
        x = self.decoder_layer_forward(encode_features[stage - 1], x,
                                       lateral_layer, merge_layer,
                                       upsample_layer)

    # ``x`` is now the output of the last decoder stage.
    return dict(spatial_features=spatial_features, seg_features=x.features)
@staticmethod
def reduce_channel(x, out_channels):
    """Reduce channels so that an element-wise addition becomes possible.

    The C1 input channels are split into ``out_channels`` consecutive
    groups and every group is summed. ``x`` is modified in place.

    Args:
        x (:obj:`SparseConvTensor`): Sparse tensor, ``x.features``
            are in shape (N, C1).
        out_channels (int): The number of channel after reduction. It must
            divide C1 and must not exceed it.

    Returns:
        :obj:`SparseConvTensor`: The same ``x`` with channel reduced
            features of shape (N, out_channels).
    """
    feats = x.features
    num_rows, num_channels = feats.shape
    assert (num_channels % out_channels == 0) and (num_channels >=
                                                   out_channels)

    grouped = feats.view(num_rows, out_channels, -1)
    x.features = grouped.sum(dim=2)
    return x
def make_decoder_layers(self, make_block, norm_cfg, in_channels):
    """make decoder layers using sparse convs.

    For every entry of ``self.decoder_channels`` three modules are
    registered on ``self``: ``lateral_layer{k}``, ``merge_layer{k}`` and
    ``upsample_layer{k}``, where ``k`` counts down from the number of
    decoder blocks to 1.

    Args:
        make_block (method): A bounded function to build blocks.
        norm_cfg (dict[str]): Config of normalization layer.
        in_channels (int): The number of input channels of the first
            decoder block.

    Returns:
        None: The layers are stored as attributes of the module.
    """
    num_blocks = len(self.decoder_channels)
    for idx, channels in enumerate(self.decoder_channels):
        level = num_blocks - idx
        paddings = self.decoder_paddings[idx]
        subm_key = f'subm{level}'

        lateral = SparseBasicBlock(
            in_channels,
            channels[0],
            conv_cfg=dict(type='SubMConv3d', indice_key=subm_key),
            norm_cfg=norm_cfg)
        setattr(self, f'lateral_layer{level}', lateral)

        # The merge layer consumes the concatenation of bottom and lateral
        # features, hence twice the input channels.
        merge = make_block(
            in_channels * 2,
            channels[1],
            3,
            norm_cfg=norm_cfg,
            padding=paddings[0],
            indice_key=subm_key,
            conv_type='SubMConv3d')
        setattr(self, f'merge_layer{level}', merge)

        if level == 1:
            # use submanifold conv instead of inverse conv
            # in the last block
            upsample_kwargs = dict(
                padding=paddings[1],
                indice_key='subm1',
                conv_type='SubMConv3d')
        else:
            upsample_kwargs = dict(
                indice_key=f'spconv{level}',
                conv_type='SparseInverseConv3d')
        upsample = make_block(
            in_channels,
            channels[2],
            3,
            norm_cfg=norm_cfg,
            **upsample_kwargs)
        setattr(self, f'upsample_layer{level}', upsample)

        in_channels = channels[2]
def forward(self, seed_points, seed_feats):
    """forward.

    Args:
        seed_points (torch.Tensor): Coordinate of the seed points in shape
            (B, N, 3).
        seed_feats (torch.Tensor): Features of the seed points in shape
            (B, C, N).

    Returns:
        tuple[torch.Tensor]:

            - vote_points: Voted xyz based on the seed points \
                with shape (B, M, 3), ``M=num_seed*vote_per_seed``.
            - vote_features: Voted features based on the seed points with \
                shape (B, C, M) where ``M=num_seed*vote_per_seed``, \
                ``C=vote_feature_dim``. When ``with_res_feat`` is False \
                the seed features are returned unchanged.
            - offset: Predicted xyz offsets with shape (B, 3, M). These \
                are the raw predictions, i.e. not clamped by \
                ``vote_xyz_range``.
    """
    if self.num_points != -1:
        assert self.num_points < seed_points.shape[1], \
            f'Number of vote points ({self.num_points}) should be '\
            f'smaller than seed points size ({seed_points.shape[1]})'
        # only the first ``num_points`` seeds take part in voting
        seed_points = seed_points[:, :self.num_points]
        seed_feats = seed_feats[..., :self.num_points]

    batch_size, feat_channels, num_seed = seed_feats.shape
    num_vote = num_seed * self.vote_per_seed
    x = self.vote_conv(seed_feats)
    # (batch_size, (3+out_dim)*vote_per_seed, num_seed)
    votes = self.conv_out(x)

    # -> (batch_size, num_seed, vote_per_seed, 3 [+ out_dim])
    votes = votes.transpose(2, 1).view(batch_size, num_seed,
                                       self.vote_per_seed, -1)

    offset = votes[:, :, :, 0:3]
    if self.vote_xyz_range is not None:
        # clamp the translation along each axis to the allowed range
        limited_offset_list = []
        for axis in range(len(self.vote_xyz_range)):
            limited_offset_list.append(offset[..., axis].clamp(
                min=-self.vote_xyz_range[axis],
                max=self.vote_xyz_range[axis]))
        limited_offset = torch.stack(limited_offset_list, -1)
        vote_points = (seed_points.unsqueeze(2) +
                       limited_offset).contiguous()
    else:
        vote_points = (seed_points.unsqueeze(2) + offset).contiguous()
    vote_points = vote_points.view(batch_size, num_vote, 3)
    offset = offset.reshape(batch_size, num_vote, 3).transpose(2, 1)

    if self.with_res_feat:
        res_feats = votes[:, :, :, 3:]
        vote_feats = (seed_feats.transpose(2, 1).unsqueeze(2) +
                      res_feats).contiguous()
        vote_feats = vote_feats.view(batch_size,
                                     num_vote, feat_channels).transpose(
                                         2, 1).contiguous()

        if self.norm_feats:
            # L2-normalize over the channel dimension. ``normalize``
            # clamps the norm from below, so an all-zero feature vector
            # stays zero instead of turning into NaN (0 / 0).
            vote_feats = torch.nn.functional.normalize(
                vote_feats, p=2, dim=1)
    else:
        vote_feats = seed_feats
    return vote_points, vote_feats, offset
seed_indices (torch.Tensor): Indices of seed points in raw points. vote_targets_mask (torch.Tensor): Mask of valid vote targets. vote_targets (torch.Tensor): Targets of votes. Returns: torch.Tensor: Weighted vote loss. """ batch_size, num_seed = seed_points.shape[:2] seed_gt_votes_mask = torch.gather(vote_targets_mask, 1, seed_indices).float() seed_indices_expand = seed_indices.unsqueeze(-1).repeat( 1, 1, 3 * self.gt_per_seed) seed_gt_votes = torch.gather(vote_targets, 1, seed_indices_expand) seed_gt_votes += seed_points.repeat(1, 1, self.gt_per_seed) weight = seed_gt_votes_mask / (torch.sum(seed_gt_votes_mask) + 1e-6) distance = self.vote_loss( vote_points.view(batch_size * num_seed, -1, 3), seed_gt_votes.view(batch_size * num_seed, -1, 3), dst_weight=weight.view(batch_size * num_seed, 1))[1] vote_loss = torch.sum(torch.min(distance, dim=1)[0]) return vote_loss ================================================ FILE: mmdet3d/models/necks/__init__.py ================================================ from mmdet.models.necks.fpn import FPN from .second_fpn import SECONDFPN __all__ = ['FPN', 'SECONDFPN'] ================================================ FILE: mmdet3d/models/necks/second_fpn.py ================================================ import numpy as np import torch from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer, constant_init, is_norm, kaiming_init) from mmcv.runner import auto_fp16 from torch import nn as nn from mmdet.models import NECKS @NECKS.register_module() class SECONDFPN(nn.Module): """FPN used in SECOND/PointPillars/PartA2/MVXNet. Args: in_channels (list[int]): Input channels of multi-scale feature maps. out_channels (list[int]): Output channels of feature maps. upsample_strides (list[int]): Strides used to upsample the feature maps. norm_cfg (dict): Config dict of normalization layers. upsample_cfg (dict): Config dict of upsample layers. conv_cfg (dict): Config dict of conv layers. 
@NECKS.register_module()
class SECONDFPN(nn.Module):
    """FPN used in SECOND/PointPillars/PartA2/MVXNet.

    Every input level is passed through its own "deblock" (resampling layer
    followed by normalization and ReLU) and the results are concatenated
    along the channel axis.

    Args:
        in_channels (list[int]): Input channels of multi-scale feature maps.
        out_channels (list[int]): Output channels of feature maps.
        upsample_strides (list[int]): Strides used to upsample the
            feature maps.
        norm_cfg (dict): Config dict of normalization layers.
        upsample_cfg (dict): Config dict of upsample layers.
        conv_cfg (dict): Config dict of conv layers.
        use_conv_for_no_stride (bool): Whether to use conv when stride is 1.
    """

    def __init__(self,
                 in_channels=[128, 128, 256],
                 out_channels=[256, 256, 256],
                 upsample_strides=[1, 2, 4],
                 norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
                 upsample_cfg=dict(type='deconv', bias=False),
                 conv_cfg=dict(type='Conv2d', bias=False),
                 use_conv_for_no_stride=False):
        # For GroupNorm the norm config would be
        # dict(type='GN', num_groups=num_groups, eps=1e-3, affine=True)
        super(SECONDFPN, self).__init__()
        assert len(out_channels) == len(upsample_strides) == len(in_channels)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.fp16_enabled = False

        stages = []
        for level, level_out in enumerate(out_channels):
            level_stride = upsample_strides[level]
            wants_upsample = level_stride > 1 or (
                level_stride == 1 and not use_conv_for_no_stride)
            if wants_upsample:
                resample = build_upsample_layer(
                    upsample_cfg,
                    in_channels=in_channels[level],
                    out_channels=level_out,
                    kernel_size=upsample_strides[level],
                    stride=upsample_strides[level])
            else:
                # Fractional stride (or stride 1 with conv requested):
                # use a strided convolution with the reciprocal stride.
                level_stride = np.round(1 / level_stride).astype(np.int64)
                resample = build_conv_layer(
                    conv_cfg,
                    in_channels=in_channels[level],
                    out_channels=level_out,
                    kernel_size=level_stride,
                    stride=level_stride)

            norm_layer = build_norm_layer(norm_cfg, level_out)[1]
            stages.append(
                nn.Sequential(resample, norm_layer, nn.ReLU(inplace=True)))
        self.deblocks = nn.ModuleList(stages)

    def init_weights(self):
        """Initialize weights of FPN.

        ``nn.Conv2d`` modules get Kaiming initialization; normalization
        layers are set to a constant 1.
        """
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                kaiming_init(module)
                continue
            if is_norm(module):
                constant_init(module, 1)

    @auto_fp16()
    def forward(self, x):
        """Forward function.

        Args:
            x (Sequence[torch.Tensor]): Multi-level feature maps, one entry
                per element of ``in_channels``; each is documented
                upstream as a 4D tensor in (N, C, H, W) shape.

        Returns:
            list[torch.Tensor]: Single-element list holding the per-level
                outputs concatenated along dim 1 (or the lone output when
                there is only one level).
        """
        assert len(x) == len(self.in_channels)

        resampled = []
        for level, deblock in enumerate(self.deblocks):
            resampled.append(deblock(x[level]))

        out = torch.cat(resampled, dim=1) if len(resampled) > 1 \
            else resampled[0]
        return [out]
class Base3DRoIHead(nn.Module, metaclass=ABCMeta):
    """Base class for 3d RoIHeads.

    Subclasses must implement the ``init_*`` hooks and ``forward_train``.
    The constructor stores the train/test configs, builds the bbox and mask
    heads when their configs are given, and always calls
    ``init_assigner_sampler``.

    Args:
        bbox_head (dict, optional): Passed to ``init_bbox_head`` when
            not None.
        mask_roi_extractor (dict, optional): Passed to ``init_mask_head``
            together with ``mask_head``.
        mask_head (dict, optional): Passed to ``init_mask_head`` when
            not None.
        train_cfg (dict, optional): Stored as ``self.train_cfg``.
        test_cfg (dict, optional): Stored as ``self.test_cfg``.
    """

    def __init__(self,
                 bbox_head=None,
                 mask_roi_extractor=None,
                 mask_head=None,
                 train_cfg=None,
                 test_cfg=None):
        super(Base3DRoIHead, self).__init__()
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg

        if bbox_head is not None:
            self.init_bbox_head(bbox_head)

        if mask_head is not None:
            self.init_mask_head(mask_roi_extractor, mask_head)

        self.init_assigner_sampler()

    @property
    def with_bbox(self):
        """bool: whether the RoIHead has box head"""
        return hasattr(self, 'bbox_head') and self.bbox_head is not None

    @property
    def with_mask(self):
        """bool: whether the RoIHead has mask head"""
        return hasattr(self, 'mask_head') and self.mask_head is not None

    @abstractmethod
    def init_weights(self, pretrained):
        """Initialize the module with pre-trained weights."""
        pass

    @abstractmethod
    def init_bbox_head(self, bbox_head=None):
        """Initialize the box head.

        Args:
            bbox_head: Box head config as forwarded by ``__init__``. The
                default only keeps zero-argument calls working.
        """
        pass

    @abstractmethod
    def init_mask_head(self, mask_roi_extractor=None, mask_head=None):
        """Initialize mask head.

        Args:
            mask_roi_extractor: RoI extractor config as forwarded by
                ``__init__``.
            mask_head: Mask head config as forwarded by ``__init__``.
                Defaults only keep zero-argument calls working.
        """
        pass

    @abstractmethod
    def init_assigner_sampler(self):
        """Initialize assigner and sampler."""
        pass

    @abstractmethod
    def forward_train(self,
                      x,
                      img_metas,
                      proposal_list,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_ignore=None,
                      **kwargs):
        """Forward function during training.

        Args:
            x (dict): Contains features from the first stage.
            img_metas (list[dict]): Meta info of each image.
            proposal_list (list[dict]): Proposal information from rpn.
            gt_bboxes (list[:obj:`BaseInstance3DBoxes`]):
                GT bboxes of each sample. The bboxes are encapsulated
                by 3D box structures.
            gt_labels (list[torch.LongTensor]): GT labels of each sample.
            gt_bboxes_ignore (list[torch.Tensor], optional):
                Ground truth boxes to be ignored.

        Returns:
            dict[str, torch.Tensor]: Losses from each head.
        """
        pass

    def simple_test(self,
                    x,
                    proposal_list,
                    img_metas,
                    proposals=None,
                    rescale=False,
                    **kwargs):
        """Test without augmentation.

        Not abstract: the base implementation does nothing and returns
        None.
        """
        pass

    def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
        """Test with augmentations.

        If rescale is False, then returned bboxes and masks will fit the scale
        of imgs[0].

        Not abstract: the base implementation does nothing and returns
        None.
        """
        pass
surface_thresh (float): Threshold for suface matching. line_thresh (float): Threshold for line matching. conv_cfg (dict): Config of convolution in prediction layer. norm_cfg (dict): Config of BN in prediction layer. objectness_loss (dict): Config of objectness loss. center_loss (dict): Config of center loss. dir_class_loss (dict): Config of direction classification loss. dir_res_loss (dict): Config of direction residual regression loss. size_class_loss (dict): Config of size classification loss. size_res_loss (dict): Config of size residual regression loss. semantic_loss (dict): Config of point-wise semantic segmentation loss. cues_objectness_loss (dict): Config of cues objectness loss. cues_semantic_loss (dict): Config of cues semantic loss. proposal_objectness_loss (dict): Config of proposal objectness loss. primitive_center_loss (dict): Config of primitive center regression loss. """ def __init__(self, num_classes, suface_matching_cfg, line_matching_cfg, bbox_coder, train_cfg=None, test_cfg=None, gt_per_seed=1, num_proposal=256, feat_channels=(128, 128), primitive_feat_refine_streams=2, primitive_refine_channels=[128, 128, 128], upper_thresh=100.0, surface_thresh=0.5, line_thresh=0.5, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=None, center_loss=None, dir_class_loss=None, dir_res_loss=None, size_class_loss=None, size_res_loss=None, semantic_loss=None, cues_objectness_loss=None, cues_semantic_loss=None, proposal_objectness_loss=None, primitive_center_loss=None): super(H3DBboxHead, self).__init__() self.num_classes = num_classes self.train_cfg = train_cfg self.test_cfg = test_cfg self.gt_per_seed = gt_per_seed self.num_proposal = num_proposal self.with_angle = bbox_coder['with_rot'] self.upper_thresh = upper_thresh self.surface_thresh = surface_thresh self.line_thresh = line_thresh self.objectness_loss = build_loss(objectness_loss) self.center_loss = build_loss(center_loss) self.dir_class_loss = build_loss(dir_class_loss) 
self.dir_res_loss = build_loss(dir_res_loss) self.size_class_loss = build_loss(size_class_loss) self.size_res_loss = build_loss(size_res_loss) self.semantic_loss = build_loss(semantic_loss) self.bbox_coder = build_bbox_coder(bbox_coder) self.num_sizes = self.bbox_coder.num_sizes self.num_dir_bins = self.bbox_coder.num_dir_bins self.cues_objectness_loss = build_loss(cues_objectness_loss) self.cues_semantic_loss = build_loss(cues_semantic_loss) self.proposal_objectness_loss = build_loss(proposal_objectness_loss) self.primitive_center_loss = build_loss(primitive_center_loss) assert suface_matching_cfg['mlp_channels'][-1] == \ line_matching_cfg['mlp_channels'][-1] # surface center matching self.surface_center_matcher = build_sa_module(suface_matching_cfg) # line center matching self.line_center_matcher = build_sa_module(line_matching_cfg) # Compute the matching scores matching_feat_dims = suface_matching_cfg['mlp_channels'][-1] self.matching_conv = ConvModule( matching_feat_dims, matching_feat_dims, 1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=True, inplace=True) self.matching_pred = nn.Conv1d(matching_feat_dims, 2, 1) # Compute the semantic matching scores self.semantic_matching_conv = ConvModule( matching_feat_dims, matching_feat_dims, 1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=True, inplace=True) self.semantic_matching_pred = nn.Conv1d(matching_feat_dims, 2, 1) # Surface feature aggregation self.surface_feats_aggregation = list() for k in range(primitive_feat_refine_streams): self.surface_feats_aggregation.append( ConvModule( matching_feat_dims, matching_feat_dims, 1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=True, inplace=True)) self.surface_feats_aggregation = nn.Sequential( *self.surface_feats_aggregation) # Line feature aggregation self.line_feats_aggregation = list() for k in range(primitive_feat_refine_streams): self.line_feats_aggregation.append( ConvModule( matching_feat_dims, matching_feat_dims, 1, padding=0, 
conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=True, inplace=True)) self.line_feats_aggregation = nn.Sequential( *self.line_feats_aggregation) # surface center(6) + line center(12) prev_channel = 18 * matching_feat_dims self.bbox_pred = nn.ModuleList() for k in range(len(primitive_refine_channels)): self.bbox_pred.append( ConvModule( prev_channel, primitive_refine_channels[k], 1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=True, inplace=False)) prev_channel = primitive_refine_channels[k] # Final object detection # Objectness scores (2), center residual (3), # heading class+residual (num_heading_bin*2), size class + # residual(num_size_cluster*4) conv_out_channel = (2 + 3 + bbox_coder['num_dir_bins'] * 2 + bbox_coder['num_sizes'] * 4 + self.num_classes) self.bbox_pred.append(nn.Conv1d(prev_channel, conv_out_channel, 1)) def init_weights(self, pretrained=None): """Initialize the weights in detector. Args: pretrained (str, optional): Path to pre-trained weights. Defaults to None. """ pass def forward(self, feats_dict, sample_mod): """Forward pass. Args: feats_dict (dict): Feature dict from backbone. sample_mod (str): Sample mode for vote aggregation layer. valid modes are "vote", "seed" and "random". Returns: dict: Predictions of vote head. 
""" ret_dict = {} aggregated_points = feats_dict['aggregated_points'] original_feature = feats_dict['aggregated_features'] batch_size = original_feature.shape[0] object_proposal = original_feature.shape[2] # Extract surface center, features and semantic predictions z_center = feats_dict['pred_z_center'] xy_center = feats_dict['pred_xy_center'] z_semantic = feats_dict['sem_cls_scores_z'] xy_semantic = feats_dict['sem_cls_scores_xy'] z_feature = feats_dict['aggregated_features_z'] xy_feature = feats_dict['aggregated_features_xy'] # Extract line points and features line_center = feats_dict['pred_line_center'] line_feature = feats_dict['aggregated_features_line'] surface_center_pred = torch.cat((z_center, xy_center), dim=1) ret_dict['surface_center_pred'] = surface_center_pred ret_dict['surface_sem_pred'] = torch.cat((z_semantic, xy_semantic), dim=1) # Extract the surface and line centers of rpn proposals rpn_proposals = feats_dict['proposal_list'] rpn_proposals_bbox = DepthInstance3DBoxes( rpn_proposals.reshape(-1, 7).clone(), box_dim=rpn_proposals.shape[-1], with_yaw=self.with_angle, origin=(0.5, 0.5, 0.5)) obj_surface_center, obj_line_center = \ rpn_proposals_bbox.get_surface_line_center() obj_surface_center = obj_surface_center.reshape( batch_size, -1, 6, 3).transpose(1, 2).reshape(batch_size, -1, 3) obj_line_center = obj_line_center.reshape(batch_size, -1, 12, 3).transpose(1, 2).reshape( batch_size, -1, 3) ret_dict['surface_center_object'] = obj_surface_center ret_dict['line_center_object'] = obj_line_center # aggregate primitive z and xy features to rpn proposals surface_center_feature_pred = torch.cat((z_feature, xy_feature), dim=2) surface_center_feature_pred = torch.cat( (surface_center_feature_pred.new_zeros( (batch_size, 6, surface_center_feature_pred.shape[2])), surface_center_feature_pred), dim=1) surface_xyz, surface_features, _ = self.surface_center_matcher( surface_center_pred, surface_center_feature_pred, target_xyz=obj_surface_center) # aggregate 
primitive line features to rpn proposals line_feature = torch.cat((line_feature.new_zeros( (batch_size, 12, line_feature.shape[2])), line_feature), dim=1) line_xyz, line_features, _ = self.line_center_matcher( line_center, line_feature, target_xyz=obj_line_center) # combine the surface and line features combine_features = torch.cat((surface_features, line_features), dim=2) matching_features = self.matching_conv(combine_features) matching_score = self.matching_pred(matching_features) ret_dict['matching_score'] = matching_score.transpose(2, 1) semantic_matching_features = self.semantic_matching_conv( combine_features) semantic_matching_score = self.semantic_matching_pred( semantic_matching_features) ret_dict['semantic_matching_score'] = \ semantic_matching_score.transpose(2, 1) surface_features = self.surface_feats_aggregation(surface_features) line_features = self.line_feats_aggregation(line_features) # Combine all surface and line features surface_features = surface_features.view(batch_size, -1, object_proposal) line_features = line_features.view(batch_size, -1, object_proposal) combine_feature = torch.cat((surface_features, line_features), dim=1) # Final bbox predictions bbox_predictions = self.bbox_pred[0](combine_feature) bbox_predictions += original_feature for conv_module in self.bbox_pred[1:]: bbox_predictions = conv_module(bbox_predictions) refine_decode_res = self.bbox_coder.split_pred( bbox_predictions[:, :self.num_classes + 2], bbox_predictions[:, self.num_classes + 2:], aggregated_points) for key in refine_decode_res.keys(): ret_dict[key + '_optimized'] = refine_decode_res[key] return ret_dict def loss(self, bbox_preds, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, img_metas=None, rpn_targets=None, gt_bboxes_ignore=None): """Compute loss. Args: bbox_preds (dict): Predictions from forward of h3d bbox head. points (list[torch.Tensor]): Input points. 
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \ bboxes of each sample. gt_labels_3d (list[torch.Tensor]): Labels of each sample. pts_semantic_mask (None | list[torch.Tensor]): Point-wise semantic mask. pts_instance_mask (None | list[torch.Tensor]): Point-wise instance mask. img_metas (list[dict]): Contain pcd and img's meta info. rpn_targets (Tuple) : Targets generated by rpn head. gt_bboxes_ignore (None | list[torch.Tensor]): Specify which bounding. Returns: dict: Losses of H3dnet. """ (vote_targets, vote_target_masks, size_class_targets, size_res_targets, dir_class_targets, dir_res_targets, center_targets, mask_targets, valid_gt_masks, objectness_targets, objectness_weights, box_loss_weights, valid_gt_weights) = rpn_targets losses = {} # calculate refined proposal loss refined_proposal_loss = self.get_proposal_stage_loss( bbox_preds, size_class_targets, size_res_targets, dir_class_targets, dir_res_targets, center_targets, mask_targets, objectness_targets, objectness_weights, box_loss_weights, valid_gt_weights, suffix='_optimized') for key in refined_proposal_loss.keys(): losses[key + '_optimized'] = refined_proposal_loss[key] bbox3d_optimized = self.bbox_coder.decode( bbox_preds, suffix='_optimized') targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, bbox_preds) (cues_objectness_label, cues_sem_label, proposal_objectness_label, cues_mask, cues_match_mask, proposal_objectness_mask, cues_matching_label, obj_surface_line_center) = targets # match scores for each geometric primitive objectness_scores = bbox_preds['matching_score'] # match scores for the semantics of primitives objectness_scores_sem = bbox_preds['semantic_matching_score'] primitive_objectness_loss = self.cues_objectness_loss( objectness_scores.transpose(2, 1), cues_objectness_label, weight=cues_mask, avg_factor=cues_mask.sum() + 1e-6) primitive_sem_loss = self.cues_semantic_loss( objectness_scores_sem.transpose(2, 1), cues_sem_label, 
weight=cues_mask, avg_factor=cues_mask.sum() + 1e-6) objectness_scores = bbox_preds['obj_scores_optimized'] objectness_loss_refine = self.proposal_objectness_loss( objectness_scores.transpose(2, 1), proposal_objectness_label) primitive_matching_loss = (objectness_loss_refine * cues_match_mask).sum() / ( cues_match_mask.sum() + 1e-6) * 0.5 primitive_sem_matching_loss = ( objectness_loss_refine * proposal_objectness_mask).sum() / ( proposal_objectness_mask.sum() + 1e-6) * 0.5 # Get the object surface center here batch_size, object_proposal = bbox3d_optimized.shape[:2] refined_bbox = DepthInstance3DBoxes( bbox3d_optimized.reshape(-1, 7).clone(), box_dim=bbox3d_optimized.shape[-1], with_yaw=self.with_angle, origin=(0.5, 0.5, 0.5)) pred_obj_surface_center, pred_obj_line_center = \ refined_bbox.get_surface_line_center() pred_obj_surface_center = pred_obj_surface_center.reshape( batch_size, -1, 6, 3).transpose(1, 2).reshape(batch_size, -1, 3) pred_obj_line_center = pred_obj_line_center.reshape( batch_size, -1, 12, 3).transpose(1, 2).reshape(batch_size, -1, 3) pred_surface_line_center = torch.cat( (pred_obj_surface_center, pred_obj_line_center), 1) square_dist = self.primitive_center_loss(pred_surface_line_center, obj_surface_line_center) match_dist = torch.sqrt(square_dist.sum(dim=-1) + 1e-6) primitive_centroid_reg_loss = torch.sum( match_dist * cues_matching_label) / ( cues_matching_label.sum() + 1e-6) refined_loss = dict( primitive_objectness_loss=primitive_objectness_loss, primitive_sem_loss=primitive_sem_loss, primitive_matching_loss=primitive_matching_loss, primitive_sem_matching_loss=primitive_sem_matching_loss, primitive_centroid_reg_loss=primitive_centroid_reg_loss) losses.update(refined_loss) return losses def get_bboxes(self, points, bbox_preds, input_metas, rescale=False, suffix=''): """Generate bboxes from vote head predictions. Args: points (torch.Tensor): Input points. bbox_preds (dict): Predictions from vote head. 
input_metas (list[dict]): Point cloud and image's meta info. rescale (bool): Whether to rescale bboxes. Returns: list[tuple[torch.Tensor]]: Bounding boxes, scores and labels. """ # decode boxes obj_scores = F.softmax( bbox_preds['obj_scores' + suffix], dim=-1)[..., -1] sem_scores = F.softmax(bbox_preds['sem_scores'], dim=-1) prediction_collection = {} prediction_collection['center'] = bbox_preds['center' + suffix] prediction_collection['dir_class'] = bbox_preds['dir_class'] prediction_collection['dir_res'] = bbox_preds['dir_res' + suffix] prediction_collection['size_class'] = bbox_preds['size_class'] prediction_collection['size_res'] = bbox_preds['size_res' + suffix] bbox3d = self.bbox_coder.decode(prediction_collection) batch_size = bbox3d.shape[0] results = list() for b in range(batch_size): bbox_selected, score_selected, labels = self.multiclass_nms_single( obj_scores[b], sem_scores[b], bbox3d[b], points[b, ..., :3], input_metas[b]) bbox = input_metas[b]['box_type_3d']( bbox_selected, box_dim=bbox_selected.shape[-1], with_yaw=self.bbox_coder.with_rot) results.append((bbox, score_selected, labels)) return results def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points, input_meta): """Multi-class nms in single batch. Args: obj_scores (torch.Tensor): Objectness score of bounding boxes. sem_scores (torch.Tensor): semantic class score of bounding boxes. bbox (torch.Tensor): Predicted bounding boxes. points (torch.Tensor): Input points. input_meta (dict): Point cloud and image's meta info. Returns: tuple[torch.Tensor]: Bounding boxes, scores and labels. 
""" bbox = input_meta['box_type_3d']( bbox, box_dim=bbox.shape[-1], with_yaw=self.bbox_coder.with_rot, origin=(0.5, 0.5, 0.5)) box_indices = bbox.points_in_boxes(points) corner3d = bbox.corners minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6))) minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0] minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0] nonempty_box_mask = box_indices.T.sum(1) > 5 bbox_classes = torch.argmax(sem_scores, -1) nms_selected = aligned_3d_nms(minmax_box3d[nonempty_box_mask], obj_scores[nonempty_box_mask], bbox_classes[nonempty_box_mask], self.test_cfg.nms_thr) # filter empty boxes and boxes with low score scores_mask = (obj_scores > self.test_cfg.score_thr) nonempty_box_inds = torch.nonzero( nonempty_box_mask, as_tuple=False).flatten() nonempty_mask = torch.zeros_like(bbox_classes).scatter( 0, nonempty_box_inds[nms_selected], 1) selected = (nonempty_mask.bool() & scores_mask.bool()) if self.test_cfg.per_class_proposal: bbox_selected, score_selected, labels = [], [], [] for k in range(sem_scores.shape[-1]): bbox_selected.append(bbox[selected].tensor) score_selected.append(obj_scores[selected] * sem_scores[selected][:, k]) labels.append( torch.zeros_like(bbox_classes[selected]).fill_(k)) bbox_selected = torch.cat(bbox_selected, 0) score_selected = torch.cat(score_selected, 0) labels = torch.cat(labels, 0) else: bbox_selected = bbox[selected].tensor score_selected = obj_scores[selected] labels = bbox_classes[selected] return bbox_selected, score_selected, labels def get_proposal_stage_loss(self, bbox_preds, size_class_targets, size_res_targets, dir_class_targets, dir_res_targets, center_targets, mask_targets, objectness_targets, objectness_weights, box_loss_weights, valid_gt_weights, suffix=''): """Compute loss for the aggregation module. Args: bbox_preds (dict): Predictions from forward of vote head. size_class_targets (torch.Tensor): Ground truth \ size class of each prediction bounding box. 
size_res_targets (torch.Tensor): Ground truth \ size residual of each prediction bounding box. dir_class_targets (torch.Tensor): Ground truth \ direction class of each prediction bounding box. dir_res_targets (torch.Tensor): Ground truth \ direction residual of each prediction bounding box. center_targets (torch.Tensor): Ground truth center \ of each prediction bounding box. mask_targets (torch.Tensor): Validation of each \ prediction bounding box. objectness_targets (torch.Tensor): Ground truth \ objectness label of each prediction bounding box. objectness_weights (torch.Tensor): Weights of objectness \ loss for each prediction bounding box. box_loss_weights (torch.Tensor): Weights of regression \ loss for each prediction bounding box. valid_gt_weights (torch.Tensor): Validation of each \ ground truth bounding box. Returns: dict: Losses of aggregation module. """ # calculate objectness loss objectness_loss = self.objectness_loss( bbox_preds['obj_scores' + suffix].transpose(2, 1), objectness_targets, weight=objectness_weights) # calculate center loss source2target_loss, target2source_loss = self.center_loss( bbox_preds['center' + suffix], center_targets, src_weight=box_loss_weights, dst_weight=valid_gt_weights) center_loss = source2target_loss + target2source_loss # calculate direction class loss dir_class_loss = self.dir_class_loss( bbox_preds['dir_class' + suffix].transpose(2, 1), dir_class_targets, weight=box_loss_weights) # calculate direction residual loss batch_size, proposal_num = size_class_targets.shape[:2] heading_label_one_hot = dir_class_targets.new_zeros( (batch_size, proposal_num, self.num_dir_bins)) heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1) dir_res_norm = (bbox_preds['dir_res_norm' + suffix] * heading_label_one_hot).sum(dim=-1) dir_res_loss = self.dir_res_loss( dir_res_norm, dir_res_targets, weight=box_loss_weights) # calculate size class loss size_class_loss = self.size_class_loss( bbox_preds['size_class' + 
suffix].transpose(2, 1), size_class_targets, weight=box_loss_weights) # calculate size residual loss one_hot_size_targets = box_loss_weights.new_zeros( (batch_size, proposal_num, self.num_sizes)) one_hot_size_targets.scatter_(2, size_class_targets.unsqueeze(-1), 1) one_hot_size_targets_expand = one_hot_size_targets.unsqueeze( -1).repeat(1, 1, 1, 3) size_residual_norm = (bbox_preds['size_res_norm' + suffix] * one_hot_size_targets_expand).sum(dim=2) box_loss_weights_expand = box_loss_weights.unsqueeze(-1).repeat( 1, 1, 3) size_res_loss = self.size_res_loss( size_residual_norm, size_res_targets, weight=box_loss_weights_expand) # calculate semantic loss semantic_loss = self.semantic_loss( bbox_preds['sem_scores' + suffix].transpose(2, 1), mask_targets, weight=box_loss_weights) losses = dict( objectness_loss=objectness_loss, semantic_loss=semantic_loss, center_loss=center_loss, dir_class_loss=dir_class_loss, dir_res_loss=dir_res_loss, size_class_loss=size_class_loss, size_res_loss=size_res_loss) return losses def get_targets(self, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, bbox_preds=None): """Generate targets of proposal module. Args: points (list[torch.Tensor]): Points of each batch. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \ bboxes of each batch. gt_labels_3d (list[torch.Tensor]): Labels of each batch. pts_semantic_mask (None | list[torch.Tensor]): Point-wise semantic label of each batch. pts_instance_mask (None | list[torch.Tensor]): Point-wise instance label of each batch. bbox_preds (torch.Tensor): Bounding box predictions of vote head. Returns: tuple[torch.Tensor]: Targets of proposal module. 
""" # find empty example valid_gt_masks = list() gt_num = list() for index in range(len(gt_labels_3d)): if len(gt_labels_3d[index]) == 0: fake_box = gt_bboxes_3d[index].tensor.new_zeros( 1, gt_bboxes_3d[index].tensor.shape[-1]) gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box) gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1) valid_gt_masks.append(gt_labels_3d[index].new_zeros(1)) gt_num.append(1) else: valid_gt_masks.append(gt_labels_3d[index].new_ones( gt_labels_3d[index].shape)) gt_num.append(gt_labels_3d[index].shape[0]) if pts_semantic_mask is None: pts_semantic_mask = [None for i in range(len(gt_labels_3d))] pts_instance_mask = [None for i in range(len(gt_labels_3d))] aggregated_points = [ bbox_preds['aggregated_points'][i] for i in range(len(gt_labels_3d)) ] surface_center_pred = [ bbox_preds['surface_center_pred'][i] for i in range(len(gt_labels_3d)) ] line_center_pred = [ bbox_preds['pred_line_center'][i] for i in range(len(gt_labels_3d)) ] surface_center_object = [ bbox_preds['surface_center_object'][i] for i in range(len(gt_labels_3d)) ] line_center_object = [ bbox_preds['line_center_object'][i] for i in range(len(gt_labels_3d)) ] surface_sem_pred = [ bbox_preds['surface_sem_pred'][i] for i in range(len(gt_labels_3d)) ] line_sem_pred = [ bbox_preds['sem_cls_scores_line'][i] for i in range(len(gt_labels_3d)) ] (cues_objectness_label, cues_sem_label, proposal_objectness_label, cues_mask, cues_match_mask, proposal_objectness_mask, cues_matching_label, obj_surface_line_center) = multi_apply( self.get_targets_single, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, aggregated_points, surface_center_pred, line_center_pred, surface_center_object, line_center_object, surface_sem_pred, line_sem_pred) cues_objectness_label = torch.stack(cues_objectness_label) cues_sem_label = torch.stack(cues_sem_label) proposal_objectness_label = torch.stack(proposal_objectness_label) cues_mask = torch.stack(cues_mask) cues_match_mask = 
    def get_targets_single(self,
                           points,
                           gt_bboxes_3d,
                           gt_labels_3d,
                           pts_semantic_mask=None,
                           pts_instance_mask=None,
                           aggregated_points=None,
                           pred_surface_center=None,
                           pred_line_center=None,
                           pred_obj_surface_center=None,
                           pred_obj_line_center=None,
                           pred_surface_sem=None,
                           pred_line_sem=None):
        """Generate targets for primitive cues for single batch.

        Every proposal (aggregated point) is assigned to its nearest ground
        truth box center. The 6 surface centers and 12 line centers of that
        box are then matched against the predicted surface / line centers,
        and binary labels are derived from the distance thresholds stored in
        ``self.train_cfg``.

        Args:
            points (torch.Tensor): Points of each batch. Only its ``device``
                is read here.
            gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth
                boxes of each batch.
            gt_labels_3d (torch.Tensor): Labels of each batch.
            pts_semantic_mask (None | torch.Tensor): Point-wise semantic
                label of each batch. Not read by this method.
            pts_instance_mask (None | torch.Tensor): Point-wise instance
                label of each batch. Not read by this method.
            aggregated_points (torch.Tensor): Aggregated points from
                vote aggregation layer.
            pred_surface_center (torch.Tensor): Prediction of surface center.
            pred_line_center (torch.Tensor): Prediction of line center.
            pred_obj_surface_center (torch.Tensor): Objectness prediction
                of surface center.
            pred_obj_line_center (torch.Tensor): Objectness prediction of
                line center.
            pred_surface_sem (torch.Tensor): Semantic prediction of
                surface center.
            pred_line_sem (torch.Tensor): Semantic prediction of line center.

        Returns:
            tuple[torch.Tensor]: Targets for primitive cues, in the order
                (cues_objectness_label, cues_sem_label,
                proposal_objectness_label, cues_mask, cues_match_mask,
                proposal_objectness_mask, cues_matching_label,
                obj_surface_line_center).
        """
        device = points.device
        gt_bboxes_3d = gt_bboxes_3d.to(device)
        num_proposals = aggregated_points.shape[0]
        gt_center = gt_bboxes_3d.gravity_center

        # Nearest GT center for every proposal (``dist2`` is unused).
        dist1, dist2, ind1, _ = chamfer_distance(
            aggregated_points.unsqueeze(0),
            gt_center.unsqueeze(0),
            reduction='none')
        # Set assignment
        object_assignment = ind1.squeeze(0)

        # Generate objectness label and mask
        # objectness_label: 1 if pred object center is within
        # self.train_cfg['near_threshold'] of any GT object
        # objectness_mask: 0 if pred object center is in gray
        # zone (DONOTCARE), 1 otherwise
        # The 1e-6 keeps the sqrt away from an exact zero argument.
        euclidean_dist1 = torch.sqrt(dist1.squeeze(0) + 1e-6)
        proposal_objectness_label = euclidean_dist1.new_zeros(
            num_proposals, dtype=torch.long)
        proposal_objectness_mask = euclidean_dist1.new_zeros(num_proposals)

        # Semantic label of the GT box assigned to each proposal.
        gt_sem = gt_labels_3d[object_assignment]

        # Surface / line centers of the GT boxes, regrouped so that the
        # primitive index (6 surfaces, 12 lines) is the leading dimension,
        # then gathered per proposal and flattened to (1, K * num_proposals,
        # 3) for the chamfer matching below.
        obj_surface_center, obj_line_center = \
            gt_bboxes_3d.get_surface_line_center()
        obj_surface_center = obj_surface_center.reshape(-1, 6,
                                                        3).transpose(0, 1)
        obj_line_center = obj_line_center.reshape(-1, 12, 3).transpose(0, 1)
        obj_surface_center = obj_surface_center[:, object_assignment].reshape(
            1, -1, 3)
        obj_line_center = obj_line_center[:,
                                          object_assignment].reshape(1, -1, 3)

        # Predicted semantic class of every predicted primitive center.
        surface_sem = torch.argmax(pred_surface_sem, dim=1).float()
        line_sem = torch.argmax(pred_line_sem, dim=1).float()

        # For each GT primitive center, find the nearest predicted one.
        dist_surface, _, surface_ind, _ = chamfer_distance(
            obj_surface_center,
            pred_surface_center.unsqueeze(0),
            reduction='none')
        dist_line, _, line_ind, _ = chamfer_distance(
            obj_line_center, pred_line_center.unsqueeze(0), reduction='none')

        # Matched predictions (position and semantic class).
        surface_sel = pred_surface_center[surface_ind.squeeze(0)]
        line_sel = pred_line_center[line_ind.squeeze(0)]
        surface_sel_sem = surface_sem[surface_ind.squeeze(0)]
        line_sel_sem = line_sem[line_ind.squeeze(0)]

        # GT semantics tiled once per primitive to line up with the
        # flattened (primitive, proposal) layout used above.
        surface_sel_sem_gt = gt_sem.repeat(6).float()
        line_sel_sem_gt = gt_sem.repeat(12).float()

        euclidean_dist_surface = torch.sqrt(dist_surface.squeeze(0) + 1e-6)
        euclidean_dist_line = torch.sqrt(dist_line.squeeze(0) + 1e-6)
        objectness_label_surface = euclidean_dist_line.new_zeros(
            num_proposals * 6, dtype=torch.long)
        # NOTE: the two zero-initialised masks below are overwritten before
        # they are read (see the ``*_obj`` assignments further down).
        objectness_mask_surface = euclidean_dist_line.new_zeros(num_proposals *
                                                                6)
        objectness_label_line = euclidean_dist_line.new_zeros(
            num_proposals * 12, dtype=torch.long)
        objectness_mask_line = euclidean_dist_line.new_zeros(num_proposals *
                                                             12)
        objectness_label_surface_sem = euclidean_dist_line.new_zeros(
            num_proposals * 6, dtype=torch.long)
        objectness_label_line_sem = euclidean_dist_line.new_zeros(
            num_proposals * 12, dtype=torch.long)

        # Distance between the object-level primitive predictions and the
        # matched primitive-level predictions.
        euclidean_dist_obj_surface = torch.sqrt((
            (pred_obj_surface_center - surface_sel)**2).sum(dim=-1) + 1e-6)
        euclidean_dist_obj_line = torch.sqrt(
            torch.sum((pred_obj_line_center - line_sel)**2, dim=-1) + 1e-6)

        # Objectness score just with centers
        proposal_objectness_label[
            euclidean_dist1 < self.train_cfg['near_threshold']] = 1
        proposal_objectness_mask[
            euclidean_dist1 < self.train_cfg['near_threshold']] = 1
        proposal_objectness_mask[
            euclidean_dist1 > self.train_cfg['far_threshold']] = 1

        # Boolean tensors are combined with ``*``, which acts as a logical
        # AND of the individual conditions.
        objectness_label_surface[
            (euclidean_dist_obj_surface <
             self.train_cfg['label_surface_threshold']) *
            (euclidean_dist_surface <
             self.train_cfg['mask_surface_threshold'])] = 1
        objectness_label_surface_sem[
            (euclidean_dist_obj_surface <
             self.train_cfg['label_surface_threshold']) *
            (euclidean_dist_surface < self.train_cfg['mask_surface_threshold'])
            * (surface_sel_sem == surface_sel_sem_gt)] = 1

        objectness_label_line[
            (euclidean_dist_obj_line < self.train_cfg['label_line_threshold'])
            *
            (euclidean_dist_line < self.train_cfg['mask_line_threshold'])] = 1
        objectness_label_line_sem[
            (euclidean_dist_obj_line < self.train_cfg['label_line_threshold'])
            * (euclidean_dist_line < self.train_cfg['mask_line_threshold']) *
            (line_sel_sem == line_sel_sem_gt)] = 1

        # Proposal-level label / mask tiled to the primitive layout.
        objectness_label_surface_obj = proposal_objectness_label.repeat(6)
        objectness_mask_surface_obj = proposal_objectness_mask.repeat(6)
        objectness_label_line_obj = proposal_objectness_label.repeat(12)
        objectness_mask_line_obj = proposal_objectness_mask.repeat(12)

        objectness_mask_surface = objectness_mask_surface_obj
        objectness_mask_line = objectness_mask_line_obj

        # These concatenations are taken BEFORE the in-place ``*=`` below.
        # ``torch.cat`` returns a new tensor, so they keep the labels that
        # are not yet gated by the proposal objectness.
        cues_objectness_label = torch.cat(
            (objectness_label_surface, objectness_label_line), 0)
        cues_sem_label = torch.cat(
            (objectness_label_surface_sem, objectness_label_line_sem), 0)
        cues_mask = torch.cat((objectness_mask_surface, objectness_mask_line),
                              0)

        # Matching labels additionally require a positive proposal.
        objectness_label_surface *= objectness_label_surface_obj
        objectness_label_line *= objectness_label_line_obj
        cues_matching_label = torch.cat(
            (objectness_label_surface, objectness_label_line), 0)

        # NOTE(review): these two in-place updates happen after
        # ``cues_sem_label`` was built and the tensors are not read again in
        # this method, so they do not influence any returned value.
        objectness_label_surface_sem *= objectness_label_surface_obj
        objectness_label_line_sem *= objectness_label_line_obj

        # A proposal counts as matched if at least one of its 18 primitives
        # (6 surfaces + 12 lines) has a positive cue label.
        cues_match_mask = (torch.sum(
            cues_objectness_label.view(18, num_proposals), dim=0) >=
                           1).float()

        obj_surface_line_center = torch.cat(
            (obj_surface_center, obj_line_center), 1).squeeze(0)

        return (cues_objectness_label, cues_sem_label,
                proposal_objectness_label, cues_mask, cues_match_mask,
                proposal_objectness_mask, cues_matching_label,
                obj_surface_line_center)
@HEADS.register_module()
class PartA2BboxHead(nn.Module):
    """PartA2 RoI head.

    Args:
        num_classes (int): The number of classes to prediction.
        seg_in_channels (int): Input channels of segmentation
            convolution layer.
        part_in_channels (int): Input channels of part convolution layer.
        seg_conv_channels (list(int)): Out channels of each
            segmentation convolution layer.
        part_conv_channels (list(int)): Out channels of each
            part convolution layer.
        merge_conv_channels (list(int)): Out channels of each
            feature merged convolution layer.
        down_conv_channels (list(int)): Out channels of each
            downsampled convolution layer.
        shared_fc_channels (list(int)): Out channels of each shared fc layer.
        cls_channels (list(int)): Out channels of each classification layer.
        reg_channels (list(int)): Out channels of each regression layer.
        dropout_ratio (float): Dropout ratio of classification and
            regression layers.
        roi_feat_size (int): The size of pooled roi features.
        with_corner_loss (bool): Whether to use corner loss or not.
        bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for box head.
        conv_cfg (dict): Config dict of convolutional layers
        norm_cfg (dict): Config dict of normalization layers
        loss_bbox (dict): Config dict of box regression loss.
        loss_cls (dict): Config dict of classification loss.
    """

    def __init__(self,
                 num_classes,
                 seg_in_channels,
                 part_in_channels,
                 seg_conv_channels=None,
                 part_conv_channels=None,
                 merge_conv_channels=None,
                 down_conv_channels=None,
                 shared_fc_channels=None,
                 cls_channels=None,
                 reg_channels=None,
                 dropout_ratio=0.1,
                 roi_feat_size=14,
                 with_corner_loss=True,
                 bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
                 conv_cfg=dict(type='Conv1d'),
                 norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
                 loss_bbox=dict(
                     type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
                 loss_cls=dict(
                     type='CrossEntropyLoss',
                     use_sigmoid=True,
                     reduction='none',
                     loss_weight=1.0)):
        super(PartA2BboxHead, self).__init__()
        self.num_classes = num_classes
        self.with_corner_loss = with_corner_loss
        self.bbox_coder = build_bbox_coder(bbox_coder)
        self.loss_bbox = build_loss(loss_bbox)
        self.loss_cls = build_loss(loss_cls)
        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)

        # The flattened output of the down convolutions feeds the shared fc.
        assert down_conv_channels[-1] == shared_fc_channels[0]

        # init layers
        part_channel_last = part_in_channels
        part_conv = []
        for i, channel in enumerate(part_conv_channels):
            part_conv.append(
                make_sparse_convmodule(
                    part_channel_last,
                    channel,
                    3,
                    padding=1,
                    norm_cfg=norm_cfg,
                    indice_key=f'rcnn_part{i}',
                    conv_type='SubMConv3d'))
            part_channel_last = channel
        self.part_conv = spconv.SparseSequential(*part_conv)

        seg_channel_last = seg_in_channels
        seg_conv = []
        for i, channel in enumerate(seg_conv_channels):
            seg_conv.append(
                make_sparse_convmodule(
                    seg_channel_last,
                    channel,
                    3,
                    padding=1,
                    norm_cfg=norm_cfg,
                    indice_key=f'rcnn_seg{i}',
                    conv_type='SubMConv3d'))
            seg_channel_last = channel
        self.seg_conv = spconv.SparseSequential(*seg_conv)

        self.conv_down = spconv.SparseSequential()

        # Part and segmentation features are concatenated channel-wise.
        merge_conv_channel_last = part_channel_last + seg_channel_last
        merge_conv = []
        for i, channel in enumerate(merge_conv_channels):
            merge_conv.append(
                make_sparse_convmodule(
                    merge_conv_channel_last,
                    channel,
                    3,
                    padding=1,
                    norm_cfg=norm_cfg,
                    indice_key='rcnn_down0'))
            merge_conv_channel_last = channel

        down_conv_channel_last = merge_conv_channel_last
        conv_down = []
        for i, channel in enumerate(down_conv_channels):
            conv_down.append(
                make_sparse_convmodule(
                    down_conv_channel_last,
                    channel,
                    3,
                    padding=1,
                    norm_cfg=norm_cfg,
                    indice_key='rcnn_down1'))
            down_conv_channel_last = channel

        self.conv_down.add_module('merge_conv',
                                  spconv.SparseSequential(*merge_conv))
        self.conv_down.add_module(
            'max_pool3d', spconv.SparseMaxPool3d(kernel_size=2, stride=2))
        self.conv_down.add_module('down_conv',
                                  spconv.SparseSequential(*conv_down))

        shared_fc_list = []
        # The stride-2 max pooling halves every spatial dimension.
        pool_size = roi_feat_size // 2
        pre_channel = shared_fc_channels[0] * pool_size**3
        for k in range(1, len(shared_fc_channels)):
            shared_fc_list.append(
                ConvModule(
                    pre_channel,
                    shared_fc_channels[k],
                    1,
                    padding=0,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    inplace=True))
            pre_channel = shared_fc_channels[k]

            if k != len(shared_fc_channels) - 1 and dropout_ratio > 0:
                shared_fc_list.append(nn.Dropout(dropout_ratio))

        self.shared_fc = nn.Sequential(*shared_fc_list)

        # Classification layer
        channel_in = shared_fc_channels[-1]
        cls_channel = 1
        cls_layers = []
        pre_channel = channel_in
        for k in range(0, len(cls_channels)):
            cls_layers.append(
                ConvModule(
                    pre_channel,
                    cls_channels[k],
                    1,
                    padding=0,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    inplace=True))
            pre_channel = cls_channels[k]
        cls_layers.append(
            ConvModule(
                pre_channel,
                cls_channel,
                1,
                padding=0,
                conv_cfg=conv_cfg,
                act_cfg=None))
        # NOTE: ``>= 0`` (rather than ``> 0``) is kept on purpose. The
        # inserted module shifts the indices inside the ``nn.Sequential``
        # and therefore the parameter names stored in checkpoints.
        if dropout_ratio >= 0:
            cls_layers.insert(1, nn.Dropout(dropout_ratio))

        self.conv_cls = nn.Sequential(*cls_layers)

        # Regression layer
        reg_layers = []
        pre_channel = channel_in
        for k in range(0, len(reg_channels)):
            reg_layers.append(
                ConvModule(
                    pre_channel,
                    reg_channels[k],
                    1,
                    padding=0,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    inplace=True))
            pre_channel = reg_channels[k]
        reg_layers.append(
            ConvModule(
                pre_channel,
                self.bbox_coder.code_size,
                1,
                padding=0,
                conv_cfg=conv_cfg,
                act_cfg=None))
        # See the note on the classification branch above.
        if dropout_ratio >= 0:
            reg_layers.insert(1, nn.Dropout(dropout_ratio))

        self.conv_reg = nn.Sequential(*reg_layers)

        self.init_weights()

    def init_weights(self):
        """Initialize weights of the bbox head."""
        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.Conv1d)):
                xavier_init(m, distribution='uniform')

        # Start the final regression layer close to zero.
        normal_init(self.conv_reg[-1].conv, mean=0, std=0.001)

    def forward(self, seg_feats, part_feats):
        """Forward pass.

        Args:
            seg_feats (torch.Tensor): Point-wise semantic features.
            part_feats (torch.Tensor): Point-wise part prediction features.

        Returns:
            tuple[torch.Tensor]: Score of class and bbox predictions.
        """
        # (B * N, out_x, out_y, out_z, 4)
        rcnn_batch_size = part_feats.shape[0]

        # transform to sparse tensors
        sparse_shape = part_feats.shape[1:4]
        # (non_empty_num, 4) ==> [bs_idx, x_idx, y_idx, z_idx]
        sparse_idx = part_feats.sum(dim=-1).nonzero(as_tuple=False)

        part_features = part_feats[sparse_idx[:, 0], sparse_idx[:, 1],
                                   sparse_idx[:, 2], sparse_idx[:, 3]]
        seg_features = seg_feats[sparse_idx[:, 0], sparse_idx[:, 1],
                                 sparse_idx[:, 2], sparse_idx[:, 3]]
        coords = sparse_idx.int()
        part_features = spconv.SparseConvTensor(part_features, coords,
                                                sparse_shape, rcnn_batch_size)
        seg_features = spconv.SparseConvTensor(seg_features, coords,
                                               sparse_shape, rcnn_batch_size)

        # forward rcnn network
        x_part = self.part_conv(part_features)
        x_rpn = self.seg_conv(seg_features)

        merged_feature = torch.cat((x_rpn.features, x_part.features),
                                   dim=1)  # (N, C)
        shared_feature = spconv.SparseConvTensor(merged_feature, coords,
                                                 sparse_shape,
                                                 rcnn_batch_size)

        x = self.conv_down(shared_feature)

        # Flatten each RoI into one feature vector for the 1x1 "fc" convs.
        shared_feature = x.dense().view(rcnn_batch_size, -1, 1)

        shared_feature = self.shared_fc(shared_feature)

        cls_score = self.conv_cls(shared_feature).transpose(
            1, 2).contiguous().squeeze(dim=1)  # (B, 1)
        bbox_pred = self.conv_reg(shared_feature).transpose(
            1, 2).contiguous().squeeze(dim=1)  # (B, C)

        return cls_score, bbox_pred

    def loss(self, cls_score, bbox_pred, rois, labels, bbox_targets,
             pos_gt_bboxes, reg_mask, label_weights, bbox_weights):
        """Computing losses.

        Args:
            cls_score (torch.Tensor): Scores of each roi.
            bbox_pred (torch.Tensor): Predictions of bboxes.
            rois (torch.Tensor): Roi bboxes.
            labels (torch.Tensor): Labels of class.
            bbox_targets (torch.Tensor): Target of positive bboxes.
            pos_gt_bboxes (torch.Tensor): Ground truths of positive bboxes.
            reg_mask (torch.Tensor): Mask for positive bboxes.
            label_weights (torch.Tensor): Weights of class loss.
            bbox_weights (torch.Tensor): Weights of bbox loss.

        Returns:
            dict: Computed losses.

                - loss_cls (torch.Tensor): Loss of classes.
                - loss_bbox (torch.Tensor): Loss of bboxes.
                - loss_corner (torch.Tensor): Loss of corners (only when
                  ``with_corner_loss`` is enabled).
        """
        losses = dict()
        rcnn_batch_size = cls_score.shape[0]

        # calculate class loss
        cls_flat = cls_score.view(-1)
        loss_cls = self.loss_cls(cls_flat, labels, label_weights)
        losses['loss_cls'] = loss_cls

        # calculate regression loss
        code_size = self.bbox_coder.code_size
        pos_inds = (reg_mask > 0)
        if not pos_inds.any():
            # No positive RoI: emit zero-valued placeholders so the set of
            # loss keys stays the same.
            losses['loss_bbox'] = loss_cls.new_tensor(0)
            if self.with_corner_loss:
                losses['loss_corner'] = loss_cls.new_tensor(0)
        else:
            pos_bbox_pred = bbox_pred.view(rcnn_batch_size, -1)[pos_inds]
            bbox_weights_flat = bbox_weights[pos_inds].view(-1, 1).repeat(
                1, pos_bbox_pred.shape[-1])
            loss_bbox = self.loss_bbox(
                pos_bbox_pred.unsqueeze(dim=0), bbox_targets.unsqueeze(dim=0),
                bbox_weights_flat.unsqueeze(dim=0))
            losses['loss_bbox'] = loss_bbox

            if self.with_corner_loss:
                # Column 0 of ``rois`` is the batch index.
                pos_roi_boxes3d = rois[..., 1:].view(-1, code_size)[pos_inds]
                pos_roi_boxes3d = pos_roi_boxes3d.view(-1, code_size)
                batch_anchors = pos_roi_boxes3d.clone().detach()
                pos_rois_rotation = pos_roi_boxes3d[..., 6].view(-1)
                roi_xyz = pos_roi_boxes3d[..., 0:3].view(-1, 3)
                # Decode relative to an anchor centred at the origin, then
                # rotate / translate back into the scene below.
                batch_anchors[..., 0:3] = 0
                # decode boxes
                pred_boxes3d = self.bbox_coder.decode(
                    batch_anchors,
                    pos_bbox_pred.view(-1, code_size)).view(-1, code_size)

                pred_boxes3d[..., 0:3] = rotation_3d_in_axis(
                    pred_boxes3d[..., 0:3].unsqueeze(1),
                    (pos_rois_rotation + np.pi / 2),
                    axis=2).squeeze(1)

                pred_boxes3d[:, 0:3] += roi_xyz

                # calculate corner loss
                loss_corner = self.get_corner_loss_lidar(
                    pred_boxes3d, pos_gt_bboxes)
                losses['loss_corner'] = loss_corner

        return losses

    def get_targets(self, sampling_results, rcnn_train_cfg, concat=True):
        """Generate targets.

        Args:
            sampling_results (list[:obj:`SamplingResult`]):
                Sampled results from rois.
            rcnn_train_cfg (:obj:`ConfigDict`): Training config of rcnn.
            concat (bool): Whether to concatenate targets between batches.
                When True, label and bbox weights are additionally
                normalized by their (clamped) sum.

        Returns:
            tuple[torch.Tensor]: Targets of boxes and class prediction.
        """
        pos_bboxes_list = [res.pos_bboxes for res in sampling_results]
        pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results]
        iou_list = [res.iou for res in sampling_results]
        targets = multi_apply(
            self._get_target_single,
            pos_bboxes_list,
            pos_gt_bboxes_list,
            iou_list,
            cfg=rcnn_train_cfg)

        (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
         bbox_weights) = targets

        if concat:
            label = torch.cat(label, 0)
            bbox_targets = torch.cat(bbox_targets, 0)
            pos_gt_bboxes = torch.cat(pos_gt_bboxes, 0)
            reg_mask = torch.cat(reg_mask, 0)

            # Clamp so that an all-zero weight vector does not divide by 0.
            label_weights = torch.cat(label_weights, 0)
            label_weights /= torch.clamp(label_weights.sum(), min=1.0)

            bbox_weights = torch.cat(bbox_weights, 0)
            bbox_weights /= torch.clamp(bbox_weights.sum(), min=1.0)

        return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
                bbox_weights)

    def _get_target_single(self, pos_bboxes, pos_gt_bboxes, ious, cfg):
        """Generate training targets for a single sample.

        Args:
            pos_bboxes (torch.Tensor): Positive boxes with shape
                (N, 7).
            pos_gt_bboxes (torch.Tensor): Ground truth boxes with shape
                (M, 7).
            ious (torch.Tensor): IoU between `pos_bboxes` and `pos_gt_bboxes`
                in shape (N, M).
            cfg (dict): Training configs. ``cls_pos_thr`` and
                ``cls_neg_thr`` are read from it.

        Returns:
            tuple[torch.Tensor]: Target for positive boxes.
                (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
                bbox_weights)
        """
        cls_pos_mask = ious > cfg.cls_pos_thr
        cls_neg_mask = ious < cfg.cls_neg_thr
        interval_mask = (cls_pos_mask == 0) & (cls_neg_mask == 0)

        # iou regression target
        label = (cls_pos_mask > 0).float()
        # Soft label for boxes between the two thresholds.
        label[interval_mask] = ious[interval_mask] * 2 - 0.5
        # label weights
        label_weights = (label >= 0).float()

        # box regression target
        reg_mask = pos_bboxes.new_zeros(ious.size(0)).long()
        reg_mask[0:pos_gt_bboxes.size(0)] = 1
        bbox_weights = (reg_mask > 0).float()
        if reg_mask.bool().any():
            pos_gt_bboxes_ct = pos_gt_bboxes.clone().detach()
            roi_center = pos_bboxes[..., 0:3]
            roi_ry = pos_bboxes[..., 6] % (2 * np.pi)

            # canonical transformation
            pos_gt_bboxes_ct[..., 0:3] -= roi_center
            pos_gt_bboxes_ct[..., 6] -= roi_ry
            pos_gt_bboxes_ct[..., 0:3] = rotation_3d_in_axis(
                pos_gt_bboxes_ct[..., 0:3].unsqueeze(1),
                -(roi_ry + np.pi / 2),
                axis=2).squeeze(1)

            # flip orientation if rois have opposite orientation
            ry_label = pos_gt_bboxes_ct[..., 6] % (2 * np.pi)  # 0 ~ 2pi
            opposite_flag = (ry_label > np.pi * 0.5) & (ry_label < np.pi * 1.5)
            ry_label[opposite_flag] = (ry_label[opposite_flag] + np.pi) % (
                2 * np.pi)  # (0 ~ pi/2, 3pi/2 ~ 2pi)
            flag = ry_label > np.pi
            ry_label[flag] = ry_label[flag] - np.pi * 2  # (-pi/2, pi/2)
            ry_label = torch.clamp(ry_label, min=-np.pi / 2, max=np.pi / 2)
            pos_gt_bboxes_ct[..., 6] = ry_label

            # Encode against an anchor at the origin with zero heading.
            rois_anchor = pos_bboxes.clone().detach()
            rois_anchor[:, 0:3] = 0
            rois_anchor[:, 6] = 0
            bbox_targets = self.bbox_coder.encode(rois_anchor,
                                                  pos_gt_bboxes_ct)
        else:
            # no fg bbox: keep the column count in sync with the coder so
            # the empty target can be concatenated with targets of other
            # samples (``loss`` reshapes everything by ``code_size``).
            bbox_targets = pos_gt_bboxes.new_empty(
                (0, self.bbox_coder.code_size))

        return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
                bbox_weights)

    def get_corner_loss_lidar(self, pred_bbox3d, gt_bbox3d, delta=1):
        """Calculate corner loss of given boxes.

        Args:
            pred_bbox3d (torch.FloatTensor): Predicted boxes in shape (N, 7).
            gt_bbox3d (torch.FloatTensor): Ground truth boxes in shape (N, 7).
            delta (float, optional): Distance at which the huber loss
                switches from its quadratic to its linear branch.
                Defaults to 1.

        Returns:
            torch.FloatTensor: Calculated corner loss in shape (N).
        """
        assert pred_bbox3d.shape[0] == gt_bbox3d.shape[0]

        # This is a little bit hack here because we assume the box for
        # Part-A2 is in LiDAR coordinates
        gt_boxes_structure = LiDARInstance3DBoxes(gt_bbox3d)
        pred_box_corners = LiDARInstance3DBoxes(pred_bbox3d).corners
        gt_box_corners = gt_boxes_structure.corners

        # This flip only changes the heading direction of GT boxes
        gt_bbox3d_flip = gt_boxes_structure.clone()
        gt_bbox3d_flip.tensor[:, 6] += np.pi
        gt_box_corners_flip = gt_bbox3d_flip.corners

        # Take the better of the two headings so that a prediction rotated
        # by pi is not penalised.
        corner_dist = torch.min(
            torch.norm(pred_box_corners - gt_box_corners, dim=2),
            torch.norm(pred_box_corners - gt_box_corners_flip,
                       dim=2))  # (N, 8)
        # huber loss
        abs_error = torch.abs(corner_dist)
        quadratic = torch.clamp(abs_error, max=delta)
        linear = (abs_error - quadratic)
        corner_loss = 0.5 * quadratic**2 + delta * linear

        return corner_loss.mean(dim=1)

    def get_bboxes(self,
                   rois,
                   cls_score,
                   bbox_pred,
                   class_labels,
                   class_pred,
                   img_metas,
                   cfg=None):
        """Generate bboxes from bbox head predictions.

        Args:
            rois (torch.Tensor): Roi bounding boxes. The first column holds
                the batch index of each roi.
            cls_score (torch.Tensor): Scores of bounding boxes.
            bbox_pred (torch.Tensor): Bounding boxes predictions
            class_labels (torch.Tensor): Label of classes
            class_pred (torch.Tensor): Score for nms.
            img_metas (list[dict]): Point cloud and image's meta info.
            cfg (:obj:`ConfigDict`): Testing config.

        Returns:
            list[tuple]: Decoded bbox, scores and labels after nms.
        """
        roi_batch_id = rois[..., 0]
        roi_boxes = rois[..., 1:]  # boxes without batch id
        batch_size = int(roi_batch_id.max().item() + 1)

        # decode boxes
        roi_ry = roi_boxes[..., 6].view(-1)
        roi_xyz = roi_boxes[..., 0:3].view(-1, 3)
        local_roi_boxes = roi_boxes.clone().detach()
        local_roi_boxes[..., 0:3] = 0
        rcnn_boxes3d = self.bbox_coder.decode(local_roi_boxes, bbox_pred)
        rcnn_boxes3d[..., 0:3] = rotation_3d_in_axis(
            rcnn_boxes3d[..., 0:3].unsqueeze(1), (roi_ry + np.pi / 2),
            axis=2).squeeze(1)
        rcnn_boxes3d[:, 0:3] += roi_xyz

        # post processing
        result_list = []
        for batch_id in range(batch_size):
            cur_class_labels = class_labels[batch_id]
            cur_cls_score = cls_score[roi_batch_id == batch_id].view(-1)

            cur_box_prob = class_pred[batch_id]
            cur_rcnn_boxes3d = rcnn_boxes3d[roi_batch_id == batch_id]
            selected = self.multi_class_nms(cur_box_prob, cur_rcnn_boxes3d,
                                            cfg.score_thr, cfg.nms_thr,
                                            img_metas[batch_id],
                                            cfg.use_rotate_nms)
            selected_bboxes = cur_rcnn_boxes3d[selected]
            selected_label_preds = cur_class_labels[selected]
            selected_scores = cur_cls_score[selected]

            result_list.append(
                (img_metas[batch_id]['box_type_3d'](selected_bboxes,
                                                    self.bbox_coder.code_size),
                 selected_scores, selected_label_preds))
        return result_list

    def multi_class_nms(self,
                        box_probs,
                        box_preds,
                        score_thr,
                        nms_thr,
                        input_meta,
                        use_rotate_nms=True):
        """Multi-class NMS for box head.

        Note:
            This function has large overlap with the `box3d_multiclass_nms`
            implemented in `mmdet3d.core.post_processing`. We are considering
            merging these two functions in the future.

        Args:
            box_probs (torch.Tensor): Predicted boxes probabilities in
                shape (N, num_classes).
            box_preds (torch.Tensor): Predicted boxes in shape (N, 7+C).
            score_thr (float | list[float]): Threshold of scores, either
                shared by all classes or given per class.
            nms_thr (float | list[float]): Threshold for NMS, either shared
                by all classes or given per class.
            input_meta (dict): Meta information of the current sample.
            use_rotate_nms (bool, optional): Whether to use rotated nms.
                Defaults to True.

        Returns:
            torch.Tensor: Selected indices. An empty ``torch.long`` tensor
                on the device of ``box_preds`` when no box survives.
        """
        if use_rotate_nms:
            nms_func = nms_gpu
        else:
            nms_func = nms_normal_gpu

        assert box_probs.shape[
            1] == self.num_classes, f'box_probs shape: {str(box_probs.shape)}'
        selected_list = []
        boxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d'](
            box_preds, self.bbox_coder.code_size).bev)

        # Broadcast scalar thresholds to one value per class.
        score_thresh = score_thr if isinstance(
            score_thr, list) else [score_thr for x in range(self.num_classes)]
        nms_thresh = nms_thr if isinstance(
            nms_thr, list) else [nms_thr for x in range(self.num_classes)]
        for k in range(0, self.num_classes):
            class_scores_keep = box_probs[:, k] >= score_thresh[k]

            if not class_scores_keep.any():
                continue

            original_idxs = class_scores_keep.nonzero(
                as_tuple=False).view(-1)
            cur_boxes_for_nms = boxes_for_nms[class_scores_keep]
            cur_rank_scores = box_probs[class_scores_keep, k]

            cur_selected = nms_func(cur_boxes_for_nms, cur_rank_scores,
                                    nms_thresh[k])

            if cur_selected.shape[0] == 0:
                continue
            # Map the NMS result back to indices into ``box_preds``.
            selected_list.append(original_idxs[cur_selected])

        if len(selected_list) > 0:
            return torch.cat(selected_list, dim=0)
        # Keep the return type a tensor (it used to be a Python list here);
        # indexing with it still yields empty results.
        return box_preds.new_zeros((0, ), dtype=torch.long)
@HEADS.register_module()
class H3DRoIHead(Base3DRoIHead):
    """RoI head of H3DNet.

    It chains three primitive heads (z, xy and line) whose outputs are
    merged into the shared feature dict before the bbox head refines the
    proposals.

    Args:
        primitive_list (List): Configs of primitive heads, in the order
            (z, xy, line). Exactly three entries are required.
        bbox_head (ConfigDict): Config of bbox_head.
        train_cfg (ConfigDict): Training config.
        test_cfg (ConfigDict): Testing config.
    """

    def __init__(self,
                 primitive_list,
                 bbox_head=None,
                 train_cfg=None,
                 test_cfg=None):
        super(H3DRoIHead, self).__init__(
            bbox_head=bbox_head, train_cfg=train_cfg, test_cfg=test_cfg)
        # One primitive head per cue type.
        assert len(primitive_list) == 3
        z_cfg = primitive_list[0]
        xy_cfg = primitive_list[1]
        line_cfg = primitive_list[2]
        self.primitive_z = build_head(z_cfg)
        self.primitive_xy = build_head(xy_cfg)
        self.primitive_line = build_head(line_cfg)

    def init_weights(self, pretrained):
        """No-op: ``H3DRoIHead`` has no weights of its own to initialize."""
        pass

    def init_mask_head(self):
        """No-op: ``H3DRoIHead`` does not own a mask head."""
        pass

    def init_bbox_head(self, bbox_head):
        """Build the box head, handing it the train / test configs."""
        bbox_head['train_cfg'] = self.train_cfg
        bbox_head['test_cfg'] = self.test_cfg
        self.bbox_head = build_head(bbox_head)

    def init_assigner_sampler(self):
        """No-op: no assigner or sampler is set up by this head."""
        pass

    def forward_train(self,
                      feats_dict,
                      img_metas,
                      points,
                      gt_bboxes_3d,
                      gt_labels_3d,
                      pts_semantic_mask,
                      pts_instance_mask,
                      gt_bboxes_ignore=None):
        """Training forward function of H3DRoIHead.

        Args:
            feats_dict (dict): Contains features from the first stage.
                It is updated in place with the outputs of every head.
            img_metas (list[dict]): Contain pcd and img's meta info.
            points (list[torch.Tensor]): Input points.
            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
                bboxes of each sample.
            gt_labels_3d (list[torch.Tensor]): Labels of each sample.
            pts_semantic_mask (None | list[torch.Tensor]): Point-wise
                semantic mask.
            pts_instance_mask (None | list[torch.Tensor]): Point-wise
                instance mask.
            gt_bboxes_ignore (None | list[torch.Tensor]): Specify
                which bounding boxes to ignore.

        Returns:
            dict: losses from each head.
        """
        losses = dict()

        sample_mod = self.train_cfg.sample_mod
        assert sample_mod in ['vote', 'seed', 'random']

        primitive_heads = (self.primitive_z, self.primitive_xy,
                           self.primitive_line)

        # Run all primitive heads first; each one sees the outputs of the
        # heads before it through ``feats_dict``.
        for head in primitive_heads:
            head_outputs = head(feats_dict, sample_mod)
            feats_dict.update(head_outputs)

        # Then collect their losses on the fully populated feature dict.
        primitive_loss_inputs = (feats_dict, points, gt_bboxes_3d,
                                 gt_labels_3d, pts_semantic_mask,
                                 pts_instance_mask, img_metas,
                                 gt_bboxes_ignore)
        for head in primitive_heads:
            head_losses = head.loss(*primitive_loss_inputs)
            losses.update(head_losses)

        # ``targets`` is taken out of the dict and passed on explicitly.
        targets = feats_dict.pop('targets')

        feats_dict.update(self.bbox_head(feats_dict, sample_mod))

        losses.update(
            self.bbox_head.loss(feats_dict, points, gt_bboxes_3d,
                                gt_labels_3d, pts_semantic_mask,
                                pts_instance_mask, img_metas, targets,
                                gt_bboxes_ignore))

        return losses

    def simple_test(self, feats_dict, img_metas, points, rescale=False):
        """Simple testing forward function of H3DRoIHead.

        Note:
            This function assumes that the batch size is 1

        Args:
            feats_dict (dict): Contains features from the first stage.
                It is updated in place with the outputs of every head.
            img_metas (list[dict]): Contain pcd and img's meta info.
            points (torch.Tensor): Input points.
            rescale (bool): Whether to rescale results.

        Returns:
            list: One ``bbox3d2result`` output per entry produced by the
                bbox head's ``get_bboxes``.
        """
        sample_mod = self.test_cfg.sample_mod
        assert sample_mod in ['vote', 'seed', 'random']

        stages = (self.primitive_z, self.primitive_xy, self.primitive_line,
                  self.bbox_head)
        for stage in stages:
            stage_outputs = stage(feats_dict, sample_mod)
            feats_dict.update(stage_outputs)

        bbox_list = self.bbox_head.get_bboxes(
            points,
            feats_dict,
            img_metas,
            rescale=rescale,
            suffix='_optimized')

        bbox_results = []
        for bboxes, scores, labels in bbox_list:
            bbox_results.append(bbox3d2result(bboxes, scores, labels))
        return bbox_results
@HEADS.register_module()
class PointwiseSemanticHead(nn.Module):
    """Semantic segmentation head for point-wise segmentation.

    Predict point-wise segmentation and part regression results for PartA2.
    See `paper <https://arxiv.org/abs/1907.03670>`_ for more details.

    Args:
        in_channels (int): The number of input channel.
        num_classes (int): The number of class.
        extra_width (float): Boxes enlarge width.
        seg_score_thr (float): Foreground score threshold; part offsets of
            points scoring at or below it are zeroed in ``forward``.
        loss_seg (dict): Config of segmentation loss.
        loss_part (dict): Config of part prediction loss.
    """

    def __init__(self,
                 in_channels,
                 num_classes=3,
                 extra_width=0.2,
                 seg_score_thr=0.3,
                 loss_seg=dict(
                     type='FocalLoss',
                     use_sigmoid=True,
                     reduction='sum',
                     gamma=2.0,
                     alpha=0.25,
                     loss_weight=1.0),
                 loss_part=dict(
                     type='CrossEntropyLoss',
                     use_sigmoid=True,
                     loss_weight=1.0)):
        super(PointwiseSemanticHead, self).__init__()
        self.extra_width = extra_width
        self.num_classes = num_classes
        self.seg_score_thr = seg_score_thr

        # One foreground logit and three part-location logits per point.
        self.seg_cls_layer = nn.Linear(in_channels, 1, bias=True)
        self.seg_reg_layer = nn.Linear(in_channels, 3, bias=True)

        self.loss_seg = build_loss(loss_seg)
        self.loss_part = build_loss(loss_part)

    def forward(self, x):
        """Forward pass.

        Args:
            x (torch.Tensor): Features from the first stage.

        Returns:
            dict: Part features, segmentation and part predictions.

                - seg_preds (torch.Tensor): Segment predictions.
                - part_preds (torch.Tensor): Part predictions.
                - part_feats (torch.Tensor): Feature predictions.
        """
        seg_preds = self.seg_cls_layer(x)  # (N, 1)
        part_preds = self.seg_reg_layer(x)  # (N, 3)

        # Both feature parts are detached from the graph.
        fg_scores = torch.sigmoid(seg_preds).detach()
        offsets = torch.sigmoid(part_preds).clone().detach()

        # Suppress part offsets of points not scored as foreground.
        fg_mask = (fg_scores > self.seg_score_thr)
        offsets[fg_mask.view(-1) == 0] = 0

        # shape (npoints, 4)
        part_feats = torch.cat((offsets, fg_scores), dim=-1)

        outputs = dict()
        outputs['seg_preds'] = seg_preds
        outputs['part_preds'] = part_preds
        outputs['part_feats'] = part_feats
        return outputs

    def get_targets_single(self, voxel_centers, gt_bboxes_3d, gt_labels_3d):
        """generate segmentation and part prediction targets for a single
        sample.

        Args:
            voxel_centers (torch.Tensor): The center of voxels in shape
                (voxel_num, 3).
            gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes in
                shape (box_num, 7).
            gt_labels_3d (torch.Tensor): Class labels of ground truths in
                shape (box_num).

        Returns:
            tuple[torch.Tensor]: Segmentation targets with shape [voxel_num]
                part prediction targets with shape [voxel_num, 3]
        """
        gt_bboxes_3d = gt_bboxes_3d.to(voxel_centers.device)
        enlarged_gt_boxes = gt_bboxes_3d.enlarged_box(self.extra_width)

        num_voxels = voxel_centers.shape[0]
        part_targets = voxel_centers.new_zeros((num_voxels, 3),
                                               dtype=torch.float32)

        box_idx = gt_bboxes_3d.points_in_boxes(voxel_centers)
        enlarge_box_idx = enlarged_gt_boxes.points_in_boxes(
            voxel_centers).long()

        # Prepend the background class so that box index -1 (no box) maps
        # to ``num_classes`` after the +1 shift.
        gt_labels_pad = F.pad(
            gt_labels_3d, (1, 0), mode='constant', value=self.num_classes)
        seg_targets = gt_labels_pad[(box_idx.long() + 1)]

        # Voxels that fall only inside the enlarged box are ignored (-1).
        in_box = box_idx > -1
        in_enlarged_box = enlarge_box_idx > -1
        seg_targets[in_box ^ in_enlarged_box] = -1

        for k in range(len(gt_bboxes_3d)):
            in_kth_box = box_idx == k
            # Boxes without any voxel (caused by velodyne reduce) are left
            # untouched.
            if in_kth_box.any():
                # Express the voxels in the canonical frame of the box.
                local_voxels = voxel_centers[
                    in_kth_box] - gt_bboxes_3d.bottom_center[k]
                local_voxels = rotation_3d_in_axis(
                    local_voxels.unsqueeze(0),
                    -gt_bboxes_3d.yaw[k].view(1),
                    axis=2)
                # Normalize by the box size and shift x / y by one half.
                part_targets[in_kth_box] = (
                    local_voxels / gt_bboxes_3d.dims[k] +
                    voxel_centers.new_tensor([0.5, 0.5, 0]))

        part_targets = torch.clamp(part_targets, min=0)
        return seg_targets, part_targets

    def get_targets(self, voxels_dict, gt_bboxes_3d, gt_labels_3d):
        """generate segmentation and part prediction targets.

        Args:
            voxels_dict (dict): Voxel information of the batch. The keys
                ``coors`` (whose first column is the sample index) and
                ``voxel_centers`` are read.
            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
                boxes of each sample.
            gt_labels_3d (list[torch.Tensor]): Class labels of ground truths
                of each sample.

        Returns:
            dict: Prediction targets

                - seg_targets (torch.Tensor): Segmentation targets
                    with shape [voxel_num].
                - part_targets (torch.Tensor): Part prediction targets
                    with shape [voxel_num, 3].
        """
        # Split the voxel centers of the batch by sample.
        voxel_center_list = [
            voxels_dict['voxel_centers'][voxels_dict['coors'][:, 0] == idx]
            for idx in range(len(gt_labels_3d))
        ]

        seg_list, part_list = multi_apply(self.get_targets_single,
                                          voxel_center_list, gt_bboxes_3d,
                                          gt_labels_3d)

        return dict(
            seg_targets=torch.cat(seg_list, dim=0),
            part_targets=torch.cat(part_list, dim=0))

    def loss(self, semantic_results, semantic_targets):
        """Calculate point-wise segmentation and part prediction losses.

        Args:
            semantic_results (dict): Results from semantic head.

                - seg_preds: Segmentation predictions.
                - part_preds: Part predictions.

            semantic_targets (dict): Targets of semantic results.

                - seg_targets: Segmentation targets.
                - part_targets: Part targets.

        Returns:
            dict: Loss of segmentation and part prediction.

                - loss_seg (torch.Tensor): Segmentation prediction loss.
                - loss_part (torch.Tensor): Part prediction loss.
        """
        seg_preds = semantic_results['seg_preds']
        part_preds = semantic_results['part_preds']
        seg_targets = semantic_targets['seg_targets']
        part_targets = semantic_targets['part_targets']

        # Foreground: a real class id. Background: ``num_classes``.
        # Ignored voxels (-1) belong to neither and get zero weight.
        fg_mask = (seg_targets > -1) & (seg_targets < self.num_classes)
        bg_mask = seg_targets == self.num_classes

        fg_weight = fg_mask.float()
        num_fg = fg_weight.sum()
        seg_weights = (fg_weight + bg_mask.float()) / torch.clamp(
            num_fg, min=1.0)
        loss_seg = self.loss_seg(seg_preds, fg_mask.long(), seg_weights)

        if num_fg > 0:
            loss_part = self.loss_part(part_preds[fg_mask],
                                       part_targets[fg_mask])
        else:
            # fake a part loss
            loss_part = loss_seg.new_tensor(0)

        return dict(loss_seg=loss_seg, loss_part=loss_part)
functional as F from mmdet3d.models.builder import build_loss from mmdet3d.models.model_utils import VoteModule from mmdet3d.ops import build_sa_module, furthest_point_sample from mmdet.core import multi_apply from mmdet.models import HEADS @HEADS.register_module() class PrimitiveHead(nn.Module): r"""Primitive head of `H3DNet `_. Args: num_dims (int): The dimension of primitive semantic information. num_classes (int): The number of class. primitive_mode (str): The mode of primitive module, avaliable mode ['z', 'xy', 'line']. bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and decoding boxes. train_cfg (dict): Config for training. test_cfg (dict): Config for testing. vote_module_cfg (dict): Config of VoteModule for point-wise votes. vote_aggregation_cfg (dict): Config of vote aggregation layer. feat_channels (tuple[int]): Convolution channels of prediction layer. upper_thresh (float): Threshold for line matching. surface_thresh (float): Threshold for suface matching. conv_cfg (dict): Config of convolution in prediction layer. norm_cfg (dict): Config of BN in prediction layer. objectness_loss (dict): Config of objectness loss. center_loss (dict): Config of center loss. semantic_loss (dict): Config of point-wise semantic segmentation loss. """ def __init__(self, num_dims, num_classes, primitive_mode, train_cfg=None, test_cfg=None, vote_module_cfg=None, vote_aggregation_cfg=None, feat_channels=(128, 128), upper_thresh=100.0, surface_thresh=0.5, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=None, center_loss=None, semantic_reg_loss=None, semantic_cls_loss=None): super(PrimitiveHead, self).__init__() assert primitive_mode in ['z', 'xy', 'line'] # The dimension of primitive semantic information. 
def forward(self, feats_dict, sample_mod):
    """Run the primitive head on backbone features.

    Args:
        feats_dict (dict): Feature dict from backbone. The last entry of
            ``feats_dict['fp_xyz_net0']`` is used as the seed points and
            ``feats_dict['hd_feature']`` as the seed features.
        sample_mod (str): Sample mode for vote aggregation layer.
            valid modes are "vote", "seed" and "random".

    Returns:
        dict: Predictions of primitive head. Every key carries the
            primitive mode of this head, either as a suffix or embedded
            (``pred_<mode>_ind``, ``pred_<mode>_center``).
    """
    assert sample_mod in ['vote', 'seed', 'random']

    mode = self.primitive_mode
    seeds_xyz = feats_dict['fp_xyz_net0'][-1]
    seeds_feat = feats_dict['hd_feature']

    # Primitive existence flag, predicted directly from the seed features.
    flag_logits = self.flag_pred(self.flag_conv(seeds_feat))
    results = {'pred_flag_' + mode: flag_logits}

    # Step 1: turn seeds into votes.
    votes_xyz, votes_feat, _ = self.vote_module(seeds_xyz, seeds_feat)
    results['vote_' + mode] = votes_xyz
    results['vote_features_' + mode] = votes_feat

    # Step 2: decide which votes the aggregation layer should group around.
    if sample_mod == 'vote':
        # Leave the choice to the aggregation layer.
        picked = None
    elif sample_mod == 'seed':
        # Furthest point sampling on the seeds; the votes belonging to the
        # chosen seeds are used.
        picked = furthest_point_sample(seeds_xyz, self.num_proposal)
    elif sample_mod == 'random':
        # Uniformly random seed indices for every sample of the batch.
        num_batch, num_seed = seeds_xyz.shape[:2]
        picked = torch.randint(
            0,
            num_seed, (num_batch, self.num_proposal),
            dtype=torch.int32,
            device=seeds_xyz.device)
    else:
        raise NotImplementedError('Unsupported sample mod!')

    agg_xyz, agg_feat, agg_inds = self.vote_aggregation(
        votes_xyz, votes_feat, picked)
    results['aggregated_points_' + mode] = agg_xyz
    results['aggregated_features_' + mode] = agg_feat
    results['aggregated_indices_' + mode] = agg_inds

    # Step 3: regress primitive offsets and semantic information.
    raw_pred = self.conv_pred(agg_feat)

    # Step 4: decode the raw predictions relative to the aggregated points.
    decoded = self.primitive_decode_scores(raw_pred, agg_xyz)
    results.update(decoded)

    center, pred_ind = self.get_primitive_center(flag_logits,
                                                 decoded['center_' + mode])
    results['pred_' + mode + '_ind'] = pred_ind
    results['pred_' + mode + '_center'] = center
    return results
def get_targets(self,
                points,
                gt_bboxes_3d,
                gt_labels_3d,
                pts_semantic_mask=None,
                pts_instance_mask=None,
                bbox_preds=None):
    """Generate targets of primitive head.

    Note:
        ``gt_bboxes_3d`` and ``gt_labels_3d`` are modified in place: a
        sample without any label gets a single all-zero box and a single
        zero label.

    Args:
        points (list[torch.Tensor]): Points of each batch.
        gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
            bboxes of each batch.
        gt_labels_3d (list[torch.Tensor]): Labels of each batch.
        pts_semantic_mask (None | list[torch.Tensor]): Point-wise semantic
            label of each batch.
        pts_instance_mask (None | list[torch.Tensor]): Point-wise instance
            label of each batch.
        bbox_preds (dict): Predictions from forward of primitive head.
            Reads ``aggregated_points_<mode>``, ``seed_points`` and
            ``seed_indices``.

    Returns:
        tuple[torch.Tensor]: Targets of primitive head, in the order
            (point_mask, point_offset, gt_primitive_center,
            gt_primitive_semantic, gt_sem_cls_label, gt_votes_mask).
    """
    num_samples = len(gt_labels_3d)

    # Samples without ground truth get one zero box / zero label so that
    # the per-sample target computation always has a box to work with.
    for index in range(num_samples):
        if len(gt_labels_3d[index]) == 0:
            fake_box = gt_bboxes_3d[index].tensor.new_zeros(
                1, gt_bboxes_3d[index].tensor.shape[-1])
            gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box)
            gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1)

    if pts_semantic_mask is None:
        # Without semantic masks both masks are derived from the boxes.
        pts_semantic_mask = [None for i in range(num_samples)]
        pts_instance_mask = [None for i in range(num_samples)]
    elif pts_instance_mask is None:
        # multi_apply needs one entry per sample; passing a bare ``None``
        # here would fail. get_targets_single derives the instance mask
        # itself when it receives ``None``.
        pts_instance_mask = [None for i in range(num_samples)]

    (point_mask, point_sem,
     point_offset) = multi_apply(self.get_targets_single, points,
                                 gt_bboxes_3d, gt_labels_3d,
                                 pts_semantic_mask, pts_instance_mask)

    point_mask = torch.stack(point_mask)
    point_sem = torch.stack(point_sem)
    point_offset = torch.stack(point_offset)

    batch_size = point_mask.shape[0]
    num_proposal = bbox_preds['aggregated_points_' +
                              self.primitive_mode].shape[1]
    num_seed = bbox_preds['seed_points'].shape[1]
    seed_inds = bbox_preds['seed_indices'].long()

    # Pick the per-point offsets at the seed indices and turn them into
    # absolute primitive centers by adding the seed coordinates.
    seed_inds_expand = seed_inds.view(batch_size, num_seed,
                                      1).repeat(1, 1, 3)
    seed_gt_votes = torch.gather(point_offset, 1, seed_inds_expand)
    seed_gt_votes += bbox_preds['seed_points']
    # NOTE(review): gathered per seed but reshaped with num_proposal;
    # this presumably relies on num_proposal == num_seed - confirm.
    gt_primitive_center = seed_gt_votes.view(batch_size * num_proposal, 1,
                                             3)

    # Same gather for the semantic targets: 3 center values, ``num_dims``
    # size values and the class label in the last column.
    seed_inds_expand_sem = seed_inds.view(batch_size, num_seed, 1).repeat(
        1, 1, 4 + self.num_dims)
    seed_gt_sem = torch.gather(point_sem, 1, seed_inds_expand_sem)
    gt_primitive_semantic = seed_gt_sem[:, :, 3:3 + self.num_dims].view(
        batch_size * num_proposal, 1, self.num_dims).contiguous()

    gt_sem_cls_label = seed_gt_sem[:, :, -1].long()

    gt_votes_mask = torch.gather(point_mask, 1, seed_inds)

    return (point_mask, point_offset, gt_primitive_center,
            gt_primitive_semantic, gt_sem_cls_label, gt_votes_mask)
gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None): """Generate targets of primitive head for single batch. Args: points (torch.Tensor): Points of each batch. gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth \ boxes of each batch. gt_labels_3d (torch.Tensor): Labels of each batch. pts_semantic_mask (None | torch.Tensor): Point-wise semantic label of each batch. pts_instance_mask (None | torch.Tensor): Point-wise instance label of each batch. Returns: tuple[torch.Tensor]: Targets of primitive head. """ gt_bboxes_3d = gt_bboxes_3d.to(points.device) num_points = points.shape[0] point_mask = points.new_zeros(num_points) # Offset to the primitive center point_offset = points.new_zeros([num_points, 3]) # Semantic information of primitive center point_sem = points.new_zeros([num_points, 3 + self.num_dims + 1]) # Generate pts_semantic_mask and pts_instance_mask when they are None if pts_semantic_mask is None or pts_instance_mask is None: points2box_mask = gt_bboxes_3d.points_in_boxes(points) assignment = points2box_mask.argmax(1) background_mask = points2box_mask.max(1)[0] == 0 if pts_semantic_mask is None: pts_semantic_mask = gt_labels_3d[assignment] pts_semantic_mask[background_mask] = self.num_classes if pts_instance_mask is None: pts_instance_mask = assignment pts_instance_mask[background_mask] = gt_labels_3d.shape[0] instance_flag = torch.nonzero( pts_semantic_mask != self.num_classes, as_tuple=False).squeeze(1) instance_labels = pts_instance_mask[instance_flag].unique() with_yaw = gt_bboxes_3d.with_yaw for i, i_instance in enumerate(instance_labels): indices = instance_flag[pts_instance_mask[instance_flag] == i_instance] coords = points[indices, :3] cur_cls_label = pts_semantic_mask[indices][0] # Bbox Corners cur_corners = gt_bboxes_3d.corners[i] plane_lower_temp = points.new_tensor( [0, 0, 1, -cur_corners[7, -1]]) upper_points = cur_corners[[1, 2, 5, 6]] refined_distance = (upper_points * plane_lower_temp[:3]).sum(dim=1) if 
self.check_horizon(upper_points) and \ plane_lower_temp[0] + plane_lower_temp[1] < \ self.train_cfg['lower_thresh']: plane_lower = points.new_tensor( [0, 0, 1, plane_lower_temp[-1]]) plane_upper = points.new_tensor( [0, 0, 1, -torch.mean(refined_distance)]) else: raise NotImplementedError('Only horizontal plane is support!') if self.check_dist(plane_upper, upper_points) is False: raise NotImplementedError( 'Mean distance to plane should be lower than thresh!') # Get the boundary points here point2plane_dist, selected = self.match_point2plane( plane_lower, coords) # Get bottom four lines if self.primitive_mode == 'line': point2line_matching = self.match_point2line( coords[selected], cur_corners, with_yaw, mode='bottom') point_mask, point_offset, point_sem = \ self._assign_primitive_line_targets(point_mask, point_offset, point_sem, coords[selected], indices[selected], cur_cls_label, point2line_matching, cur_corners, [1, 1, 0, 0], with_yaw, mode='bottom') # Set the surface labels here if self.primitive_mode == 'z' and \ selected.sum() > self.train_cfg['num_point'] and \ point2plane_dist[selected].var() < \ self.train_cfg['var_thresh']: point_mask, point_offset, point_sem = \ self._assign_primitive_surface_targets(point_mask, point_offset, point_sem, coords[selected], indices[selected], cur_cls_label, cur_corners, with_yaw, mode='bottom') # Get the boundary points here point2plane_dist, selected = self.match_point2plane( plane_upper, coords) # Get top four lines if self.primitive_mode == 'line': point2line_matching = self.match_point2line( coords[selected], cur_corners, with_yaw, mode='top') point_mask, point_offset, point_sem = \ self._assign_primitive_line_targets(point_mask, point_offset, point_sem, coords[selected], indices[selected], cur_cls_label, point2line_matching, cur_corners, [1, 1, 0, 0], with_yaw, mode='top') if self.primitive_mode == 'z' and \ selected.sum() > self.train_cfg['num_point'] and \ point2plane_dist[selected].var() < \ 
self.train_cfg['var_thresh']: point_mask, point_offset, point_sem = \ self._assign_primitive_surface_targets(point_mask, point_offset, point_sem, coords[selected], indices[selected], cur_cls_label, cur_corners, with_yaw, mode='top') # Get left two lines plane_left_temp = self._get_plane_fomulation( cur_corners[2] - cur_corners[3], cur_corners[3] - cur_corners[0], cur_corners[0]) right_points = cur_corners[[4, 5, 7, 6]] plane_left_temp /= torch.norm(plane_left_temp[:3]) refined_distance = (right_points * plane_left_temp[:3]).sum(dim=1) if plane_left_temp[2] < self.train_cfg['lower_thresh']: plane_left = plane_left_temp plane_right = points.new_tensor([ plane_left_temp[0], plane_left_temp[1], plane_left_temp[2], -refined_distance.mean() ]) else: raise NotImplementedError( 'Normal vector of the plane should be horizontal!') # Get the boundary points here point2plane_dist, selected = self.match_point2plane( plane_left, coords) # Get left four lines if self.primitive_mode == 'line': point2line_matching = self.match_point2line( coords[selected], cur_corners, with_yaw, mode='left') point_mask, point_offset, point_sem = \ self._assign_primitive_line_targets( point_mask, point_offset, point_sem, coords[selected], indices[selected], cur_cls_label, point2line_matching[2:], cur_corners, [2, 2], with_yaw, mode='left') if self.primitive_mode == 'xy' and \ selected.sum() > self.train_cfg['num_point'] and \ point2plane_dist[selected].var() < \ self.train_cfg['var_thresh']: point_mask, point_offset, point_sem = \ self._assign_primitive_surface_targets( point_mask, point_offset, point_sem, coords[selected], indices[selected], cur_cls_label, cur_corners, with_yaw, mode='left') # Get the boundary points here point2plane_dist, selected = self.match_point2plane( plane_right, coords) # Get right four lines if self.primitive_mode == 'line': point2line_matching = self.match_point2line( coords[selected], cur_corners, with_yaw, mode='right') point_mask, point_offset, point_sem = \ 
self._assign_primitive_line_targets( point_mask, point_offset, point_sem, coords[selected], indices[selected], cur_cls_label, point2line_matching[2:], cur_corners, [2, 2], with_yaw, mode='right') if self.primitive_mode == 'xy' and \ selected.sum() > self.train_cfg['num_point'] and \ point2plane_dist[selected].var() < \ self.train_cfg['var_thresh']: point_mask, point_offset, point_sem = \ self._assign_primitive_surface_targets( point_mask, point_offset, point_sem, coords[selected], indices[selected], cur_cls_label, cur_corners, with_yaw, mode='right') plane_front_temp = self._get_plane_fomulation( cur_corners[0] - cur_corners[4], cur_corners[4] - cur_corners[5], cur_corners[5]) back_points = cur_corners[[3, 2, 7, 6]] plane_front_temp /= torch.norm(plane_front_temp[:3]) refined_distance = (back_points * plane_front_temp[:3]).sum(dim=1) if plane_front_temp[2] < self.train_cfg['lower_thresh']: plane_front = plane_front_temp plane_back = points.new_tensor([ plane_front_temp[0], plane_front_temp[1], plane_front_temp[2], -torch.mean(refined_distance) ]) else: raise NotImplementedError( 'Normal vector of the plane should be horizontal!') # Get the boundary points here point2plane_dist, selected = self.match_point2plane( plane_front, coords) if self.primitive_mode == 'xy' and \ selected.sum() > self.train_cfg['num_point'] and \ (point2plane_dist[selected]).var() < \ self.train_cfg['var_thresh']: point_mask, point_offset, point_sem = \ self._assign_primitive_surface_targets( point_mask, point_offset, point_sem, coords[selected], indices[selected], cur_cls_label, cur_corners, with_yaw, mode='front') # Get the boundary points here point2plane_dist, selected = self.match_point2plane( plane_back, coords) if self.primitive_mode == 'xy' and \ selected.sum() > self.train_cfg['num_point'] and \ point2plane_dist[selected].var() < \ self.train_cfg['var_thresh']: point_mask, point_offset, point_sem = \ self._assign_primitive_surface_targets( point_mask, point_offset, point_sem, 
def check_dist(self, plane_equ, points):
    """Whether the mean of points to plane distance is lower than thresh.

    The distance is the signed offset along the third coordinate
    (``points[:, 2] + plane_equ[-1]``), summed and divided by four, so the
    check is written for a horizontal plane and four points.

    A plain Python ``bool`` is returned rather than a tensor, so identity
    tests such as ``check_dist(...) is False`` behave as intended.

    Args:
        plane_equ (torch.Tensor): Plane to be checked; only its last
            entry is used.
        points (torch.Tensor): Points to be checked.

    Returns:
        bool: True when the mean offset is below
            ``train_cfg['lower_thresh']``.
    """
    mean_offset = (points[:, 2] + plane_equ[-1]).sum() / 4.0
    return bool(mean_offset < self.train_cfg['lower_thresh'])
def match_point2line(self, points, corners, with_yaw, mode='bottom'):
    """Match points to corresponding line.

    Args:
        points (torch.Tensor): Points of input.
        corners (torch.Tensor): Eight corners of a bounding box.
        with_yaw (Bool): Whether the bounding box is with rotation.
        mode (str, optional): Specify which line should be matched,
            available mode are ('bottom', 'top', 'left', 'right').
            Only consulted for rotated boxes. Defaults to 'bottom'.

    Returns:
        list[torch.Tensor]: Four boolean masks over ``points``, one per
            candidate line, true where the point lies closer to the line
            than ``train_cfg['line_thresh']``.
    """
    if not with_yaw:
        # Axis-aligned box: the candidate lines are x == xmin, x == xmax,
        # y == ymin and y == ymax, in that order.
        xmin, ymin, _ = corners.min(0)[0]
        xmax, ymax, _ = corners.max(0)[0]
        bounds = ((0, xmin), (0, xmax), (1, ymin), (1, ymax))
        return [
            torch.abs(points[:, axis] - bound) < self.train_cfg['line_thresh']
            for axis, bound in bounds
        ]

    # Rotated box: every line is given by the indices of its two corners.
    line_corner_ids = {
        'bottom': [[0, 3], [4, 7], [0, 4], [3, 7]],
        'top': [[1, 2], [5, 6], [1, 5], [2, 6]],
        'left': [[0, 1], [3, 2], [0, 1], [3, 2]],
        'right': [[4, 5], [7, 6], [4, 5], [7, 6]]
    }
    return [
        self.point2line_dist(points, corners[start], corners[end]) <
        self.train_cfg['line_thresh']
        for start, end in line_corner_ids[mode]
    ]
def compute_primitive_loss(self, primitive_center, primitive_semantic,
                           semantic_scores, num_proposal,
                           gt_primitive_center, gt_primitive_semantic,
                           gt_sem_cls_label, gt_primitive_mask):
    """Compute loss of primitive module.

    Args:
        primitive_center (torch.Tensor): Predictions of primitive center.
        primitive_semantic (torch.Tensor): Predictions of primitive
            semantic. Not used when the primitive mode is 'line'.
        semantic_scores (torch.Tensor): Predictions of primitive
            semantic scores.
        num_proposal (int): The number of primitive proposal.
        gt_primitive_center (torch.Tensor): Ground truth of
            primitive center.
        gt_primitive_semantic (torch.Tensor): Ground truth of
            primitive semantic.
        gt_sem_cls_label (torch.Tensor): Ground truth of primitive
            semantic class.
        gt_primitive_mask (torch.Tensor): Ground truth of primitive mask,
            used as the weight of all three losses.

    Returns:
        Tuple: ``(center_loss, size_loss, sem_cls_loss)``.
    """
    num_flat = primitive_center.shape[0] * num_proposal
    # One weight per (sample, proposal) pair for the two distance losses.
    flat_weight = gt_primitive_mask.view(num_flat, 1)

    # The center loss returns a sequence; its second entry is the value
    # kept here.
    center_loss = self.center_loss(
        primitive_center.view(num_flat, -1, 3),
        gt_primitive_center,
        dst_weight=flat_weight)[1]

    if self.primitive_mode == 'line':
        # Line primitives have no size to regress.
        size_loss = center_loss.new_tensor(0.0)
    else:
        flat_semantic = primitive_semantic.view(
            num_flat, -1, self.num_dims).contiguous()
        size_loss = self.semantic_reg_loss(
            flat_semantic, gt_primitive_semantic,
            dst_weight=flat_weight)[1]

    # Semantic cls loss
    sem_cls_loss = self.semantic_cls_loss(
        semantic_scores, gt_sem_cls_label, weight=gt_primitive_mask)

    return center_loss, size_loss, sem_cls_loss
Returns: Tuple: Primitive center and the prediction indices. """ ind_normal = F.softmax(pred_flag, dim=1) pred_indices = (ind_normal[:, 1, :] > self.surface_thresh).detach().float() selected = (ind_normal[:, 1, :] <= self.surface_thresh).detach().float() offset = torch.ones_like(center) * self.upper_thresh center = center + offset * selected.unsqueeze(-1) return center, pred_indices def _assign_primitive_line_targets(self, point_mask, point_offset, point_sem, coords, indices, cls_label, point2line_matching, corners, center_axises, with_yaw, mode='bottom'): """Generate targets of line primitive. Args: point_mask (torch.Tensor): Tensor to store the ground truth of mask. point_offset (torch.Tensor): Tensor to store the ground truth of offset. point_sem (torch.Tensor): Tensor to store the ground truth of semantic. coords (torch.Tensor): The selected points. indices (torch.Tensor): Indices of the selected points. cls_label (int): Class label of the ground truth bounding box. point2line_matching (torch.Tensor): Flag indicate that matching line of each point. corners (torch.Tensor): Corners of the ground truth bounding box. center_axises (list[int]): Indicate in which axis the line center should be refined. with_yaw (Bool): Whether the boundind box is with rotation. mode (str, optional): Specify which line should be matched, available mode are ('bottom', 'top', 'left', 'right'). Defaults to 'bottom'. Returns: Tuple: Targets of the line primitive. 
""" corners_pair = { 'bottom': [[0, 3], [4, 7], [0, 4], [3, 7]], 'top': [[1, 2], [5, 6], [1, 5], [2, 6]], 'left': [[0, 1], [3, 2]], 'right': [[4, 5], [7, 6]] } corners_pair = corners_pair[mode] assert len(corners_pair) == len(point2line_matching) == len( center_axises) for line_select, center_axis, pair_index in zip( point2line_matching, center_axises, corners_pair): if line_select.sum() > self.train_cfg['num_point_line']: point_mask[indices[line_select]] = 1.0 if with_yaw: line_center = (corners[pair_index[0]] + corners[pair_index[1]]) / 2 else: line_center = coords[line_select].mean(dim=0) line_center[center_axis] = corners[:, center_axis].mean() point_offset[indices[line_select]] = \ line_center - coords[line_select] point_sem[indices[line_select]] = \ point_sem.new_tensor([line_center[0], line_center[1], line_center[2], cls_label]) return point_mask, point_offset, point_sem def _assign_primitive_surface_targets(self, point_mask, point_offset, point_sem, coords, indices, cls_label, corners, with_yaw, mode='bottom'): """Generate targets for primitive z and primitive xy. Args: point_mask (torch.Tensor): Tensor to store the ground truth of mask. point_offset (torch.Tensor): Tensor to store the ground truth of offset. point_sem (torch.Tensor): Tensor to store the ground truth of semantic. coords (torch.Tensor): The selected points. indices (torch.Tensor): Indices of the selected points. cls_label (int): Class label of the ground truth bounding box. corners (torch.Tensor): Corners of the ground truth bounding box. with_yaw (Bool): Whether the boundind box is with rotation. mode (str, optional): Specify which line should be matched, available mode are ('bottom', 'top', 'left', 'right', 'front', 'back'). Defaults to 'bottom'. Returns: Tuple: Targets of the center primitive. 
""" point_mask[indices] = 1.0 corners_pair = { 'bottom': [0, 7], 'top': [1, 6], 'left': [0, 1], 'right': [4, 5], 'front': [0, 1], 'back': [3, 2] } pair_index = corners_pair[mode] if self.primitive_mode == 'z': if with_yaw: center = (corners[pair_index[0]] + corners[pair_index[1]]) / 2.0 center[2] = coords[:, 2].mean() point_sem[indices] = point_sem.new_tensor([ center[0], center[1], center[2], (corners[4] - corners[0]).norm(), (corners[3] - corners[0]).norm(), cls_label ]) else: center = point_mask.new_tensor([ corners[:, 0].mean(), corners[:, 1].mean(), coords[:, 2].mean() ]) point_sem[indices] = point_sem.new_tensor([ center[0], center[1], center[2], corners[:, 0].max() - corners[:, 0].min(), corners[:, 1].max() - corners[:, 1].min(), cls_label ]) elif self.primitive_mode == 'xy': if with_yaw: center = coords.mean(0) center[2] = (corners[pair_index[0], 2] + corners[pair_index[1], 2]) / 2.0 point_sem[indices] = point_sem.new_tensor([ center[0], center[1], center[2], corners[pair_index[1], 2] - corners[pair_index[0], 2], cls_label ]) else: center = point_mask.new_tensor([ coords[:, 0].mean(), coords[:, 1].mean(), corners[:, 2].mean() ]) point_sem[indices] = point_sem.new_tensor([ center[0], center[1], center[2], corners[:, 2].max() - corners[:, 2].min(), cls_label ]) point_offset[indices] = center - coords return point_mask, point_offset, point_sem def _get_plane_fomulation(self, vector1, vector2, point): """Compute the equation of the plane. Args: vector1 (torch.Tensor): Parallel vector of the plane. vector2 (torch.Tensor): Parallel vector of the plane. point (torch.Tensor): Point on the plane. Returns: torch.Tensor: Equation of the plane. 
@HEADS.register_module()
class PartAggregationROIHead(Base3DRoIHead):
    """Part aggregation roi head for PartA2.

    The head owns a semantic (part segmentation) head, two RoI extractors
    (one for segmentation features, one for part features) and a bbox head
    that refines the first-stage proposals.

    Args:
        semantic_head (ConfigDict): Config of semantic head. Must not be
            None.
        num_classes (int): The number of classes.
        seg_roi_extractor (ConfigDict): Config of seg_roi_extractor.
        part_roi_extractor (ConfigDict): Config of part_roi_extractor.
        bbox_head (ConfigDict): Config of bbox_head.
        train_cfg (ConfigDict): Training config.
        test_cfg (ConfigDict): Testing config.
    """

    def __init__(self,
                 semantic_head,
                 num_classes=3,
                 seg_roi_extractor=None,
                 part_roi_extractor=None,
                 bbox_head=None,
                 train_cfg=None,
                 test_cfg=None):
        super(PartAggregationROIHead, self).__init__(
            bbox_head=bbox_head, train_cfg=train_cfg, test_cfg=test_cfg)
        self.num_classes = num_classes
        assert semantic_head is not None
        self.semantic_head = build_head(semantic_head)

        # The extractors are optional at construction time, but
        # ``_bbox_forward`` uses both of them unconditionally.
        if seg_roi_extractor is not None:
            self.seg_roi_extractor = build_roi_extractor(seg_roi_extractor)
        if part_roi_extractor is not None:
            self.part_roi_extractor = build_roi_extractor(part_roi_extractor)

        self.init_assigner_sampler()

    def init_weights(self, pretrained):
        """Initialize weights, skip since ``PartAggregationROIHead`` does not
        need to initialize weights."""
        pass

    def init_mask_head(self):
        """Initialize mask head, skip since ``PartAggregationROIHead`` does not
        have one."""
        pass

    def init_bbox_head(self, bbox_head):
        """Initialize box head.

        Args:
            bbox_head (ConfigDict): Config passed to ``build_head``.
        """
        self.bbox_head = build_head(bbox_head)

    def init_assigner_sampler(self):
        """Initialize assigner and sampler.

        Both stay ``None`` when no ``train_cfg`` is given. The assigner
        config may be a single dict or a list of dicts; a list yields one
        assigner per entry (used as one assigner per class in
        ``_assign_and_sample``).
        """
        self.bbox_assigner = None
        self.bbox_sampler = None
        if self.train_cfg:
            if isinstance(self.train_cfg.assigner, dict):
                self.bbox_assigner = build_assigner(self.train_cfg.assigner)
            elif isinstance(self.train_cfg.assigner, list):
                self.bbox_assigner = [
                    build_assigner(res) for res in self.train_cfg.assigner
                ]
            self.bbox_sampler = build_sampler(self.train_cfg.sampler)

    @property
    def with_semantic(self):
        """bool: whether the head has semantic branch"""
        return hasattr(self,
                       'semantic_head') and self.semantic_head is not None

    def forward_train(self, feats_dict, voxels_dict, img_metas, proposal_list,
                      gt_bboxes_3d, gt_labels_3d):
        """Training forward function of PartAggregationROIHead.

        Args:
            feats_dict (dict): Contains features from the first stage.
            voxels_dict (dict): Contains information of voxels.
            img_metas (list[dict]): Meta info of each image.
            proposal_list (list[dict]): Proposal information from rpn.
                The dictionary should contain the following keys:

                - boxes_3d (:obj:`BaseInstance3DBoxes`): Proposal bboxes
                - labels_3d (torch.Tensor): Labels of proposals
                - cls_preds (torch.Tensor): Original scores of proposals
            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]):
                GT bboxes of each sample. The bboxes are encapsulated
                by 3D box structures.
            gt_labels_3d (list[LongTensor]): GT labels of each sample.

        Returns:
            dict: losses from each head.

                - loss_semantic (torch.Tensor): loss of semantic head
                - loss_bbox (torch.Tensor): loss of bboxes
        """
        losses = dict()
        if self.with_semantic:
            semantic_results = self._semantic_forward_train(
                feats_dict['seg_features'], voxels_dict, gt_bboxes_3d,
                gt_labels_3d)
            losses.update(semantic_results['loss_semantic'])

        sample_results = self._assign_and_sample(proposal_list, gt_bboxes_3d,
                                                 gt_labels_3d)
        if self.with_bbox:
            # The bbox branch consumes the part features produced by the
            # semantic branch above.
            bbox_results = self._bbox_forward_train(
                feats_dict['seg_features'], semantic_results['part_feats'],
                voxels_dict, sample_results)
            losses.update(bbox_results['loss_bbox'])

        return losses

    def simple_test(self, feats_dict, voxels_dict, img_metas, proposal_list,
                    **kwargs):
        """Simple testing forward function of PartAggregationROIHead.

        Note:
            This function assumes that the batch size is 1

        Args:
            feats_dict (dict): Contains features from the first stage.
            voxels_dict (dict): Contains information of voxels.
            img_metas (list[dict]): Meta info of each image.
            proposal_list (list[dict]): Proposal information from rpn.

        Returns:
            list: One ``bbox3d2result`` output per entry returned by
                ``bbox_head.get_bboxes``.
        """
        assert self.with_bbox, 'Bbox head must be implemented.'
        assert self.with_semantic

        semantic_results = self.semantic_head(feats_dict['seg_features'])

        rois = bbox3d2roi([res['boxes_3d'].tensor for res in proposal_list])
        labels_3d = [res['labels_3d'] for res in proposal_list]
        cls_preds = [res['cls_preds'] for res in proposal_list]
        bbox_results = self._bbox_forward(feats_dict['seg_features'],
                                          semantic_results['part_feats'],
                                          voxels_dict, rois)

        bbox_list = self.bbox_head.get_bboxes(
            rois,
            bbox_results['cls_score'],
            bbox_results['bbox_pred'],
            labels_3d,
            cls_preds,
            img_metas,
            cfg=self.test_cfg)

        bbox_results = [
            bbox3d2result(bboxes, scores, labels)
            for bboxes, scores, labels in bbox_list
        ]
        return bbox_results

    def _bbox_forward_train(self, seg_feats, part_feats, voxels_dict,
                            sampling_results):
        """Forward training function of roi_extractor and bbox_head.

        Args:
            seg_feats (torch.Tensor): Point-wise semantic features.
            part_feats (torch.Tensor): Point-wise part prediction features.
            voxels_dict (dict): Contains information of voxels.
            sampling_results (:obj:`SamplingResult`): Sampled results used
                for training.

        Returns:
            dict: Forward results including losses and predictions.
        """
        rois = bbox3d2roi([res.bboxes for res in sampling_results])
        bbox_results = self._bbox_forward(seg_feats, part_feats, voxels_dict,
                                          rois)

        bbox_targets = self.bbox_head.get_targets(sampling_results,
                                                  self.train_cfg)
        loss_bbox = self.bbox_head.loss(bbox_results['cls_score'],
                                        bbox_results['bbox_pred'], rois,
                                        *bbox_targets)

        bbox_results.update(loss_bbox=loss_bbox)
        return bbox_results

    def _bbox_forward(self, seg_feats, part_feats, voxels_dict, rois):
        """Forward function of roi_extractor and bbox_head used in both
        training and testing.

        Args:
            seg_feats (torch.Tensor): Point-wise semantic features.
            part_feats (torch.Tensor): Point-wise part prediction features.
            voxels_dict (dict): Contains information of voxels. The keys
                ``voxel_centers`` and ``coors`` are read here.
            rois (Tensor): Roi boxes.

        Returns:
            dict: Contains predictions of bbox_head and
                features of roi_extractor.
        """
        # ``coors[..., 0]`` is passed as the per-voxel batch index argument
        # of the extractors.
        pooled_seg_feats = self.seg_roi_extractor(seg_feats,
                                                  voxels_dict['voxel_centers'],
                                                  voxels_dict['coors'][..., 0],
                                                  rois)
        pooled_part_feats = self.part_roi_extractor(
            part_feats, voxels_dict['voxel_centers'],
            voxels_dict['coors'][..., 0], rois)
        cls_score, bbox_pred = self.bbox_head(pooled_seg_feats,
                                              pooled_part_feats)

        bbox_results = dict(
            cls_score=cls_score,
            bbox_pred=bbox_pred,
            pooled_seg_feats=pooled_seg_feats,
            pooled_part_feats=pooled_part_feats)
        return bbox_results

    def _assign_and_sample(self, proposal_list, gt_bboxes_3d, gt_labels_3d):
        """Assign and sample proposals for training.

        Args:
            proposal_list (list[dict]): Proposals produced by RPN.
            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]):
                Ground truth boxes.
            gt_labels_3d (list[torch.Tensor]): Ground truth labels

        Returns:
            list[:obj:`SamplingResult`]: Sampled results of each training
                sample.
        """
        sampling_results = []
        # bbox assign
        for batch_idx in range(len(proposal_list)):
            cur_proposal_list = proposal_list[batch_idx]
            cur_boxes = cur_proposal_list['boxes_3d']
            cur_labels_3d = cur_proposal_list['labels_3d']
            cur_gt_bboxes = gt_bboxes_3d[batch_idx].to(cur_boxes.device)
            cur_gt_labels = gt_labels_3d[batch_idx]

            batch_num_gts = 0
            # 0 is bg
            batch_gt_indis = cur_gt_labels.new_full((len(cur_boxes), ), 0)
            batch_max_overlaps = cur_boxes.tensor.new_zeros(len(cur_boxes))
            # -1 is bg
            batch_gt_labels = cur_gt_labels.new_full((len(cur_boxes), ), -1)

            # each class may have its own assigner
            if isinstance(self.bbox_assigner, list):
                for i, assigner in enumerate(self.bbox_assigner):
                    # Assign proposals predicted as class ``i`` against the
                    # ground truths of class ``i`` only.
                    gt_per_cls = (cur_gt_labels == i)
                    pred_per_cls = (cur_labels_3d == i)
                    cur_assign_res = assigner.assign(
                        cur_boxes.tensor[pred_per_cls],
                        cur_gt_bboxes.tensor[gt_per_cls],
                        gt_labels=cur_gt_labels[gt_per_cls])
                    # gather assign_results in different class into one result
                    batch_num_gts += cur_assign_res.num_gts
                    # Build a lookup table that maps the per-class result in
                    # ``cur_assign_res.gt_inds`` to an index into this
                    # sample's full gt list.
                    # gt inds (1-based)
                    gt_inds_arange_pad = gt_per_cls.nonzero(
                        as_tuple=False).view(-1) + 1
                    # pad 0 for indices that are unassigned
                    gt_inds_arange_pad = F.pad(
                        gt_inds_arange_pad, (1, 0), mode='constant', value=0)
                    # pad -1 for indices that are ignored
                    gt_inds_arange_pad = F.pad(
                        gt_inds_arange_pad, (1, 0), mode='constant', value=-1)
                    # shift everything by one so the table is non-negative;
                    # it is looked up with ``gt_inds + 1`` and shifted back
                    # with ``- 1`` below
                    gt_inds_arange_pad += 1
                    # after the lookup: -1 stays -1, 0 stays 0 and a
                    # per-class index k > 0 becomes the 1-based index of that
                    # gt within the whole sample
                    batch_gt_indis[pred_per_cls] = gt_inds_arange_pad[
                        cur_assign_res.gt_inds + 1] - 1
                    batch_max_overlaps[
                        pred_per_cls] = cur_assign_res.max_overlaps
                    batch_gt_labels[pred_per_cls] = cur_assign_res.labels

                assign_result = AssignResult(batch_num_gts, batch_gt_indis,
                                             batch_max_overlaps,
                                             batch_gt_labels)
            else:  # for single class
                assign_result = self.bbox_assigner.assign(
                    cur_boxes.tensor,
                    cur_gt_bboxes.tensor,
                    gt_labels=cur_gt_labels)
            # sample boxes
            sampling_result = self.bbox_sampler.sample(assign_result,
                                                       cur_boxes.tensor,
                                                       cur_gt_bboxes.tensor,
                                                       cur_gt_labels)
            sampling_results.append(sampling_result)
        return sampling_results

    def _semantic_forward_train(self, x, voxels_dict, gt_bboxes_3d,
                                gt_labels_3d):
        """Train semantic head.

        Args:
            x (torch.Tensor): Point-wise semantic features for segmentation
            voxels_dict (dict): Contains information of voxels.
            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]):
                Ground truth boxes.
            gt_labels_3d (list[torch.Tensor]): Ground truth labels

        Returns:
            dict: Segmentation results including losses (stored under the
                ``loss_semantic`` key).
        """
        semantic_results = self.semantic_head(x)
        semantic_targets = self.semantic_head.get_targets(
            voxels_dict, gt_bboxes_3d, gt_labels_3d)
        loss_semantic = self.semantic_head.loss(semantic_results,
                                                semantic_targets)
        semantic_results.update(loss_semantic=loss_semantic)
        return semantic_results
from .clip_sigmoid import clip_sigmoid
from .inverse_sigmoid import inverse_sigmoid
from .mlp import MLP
from .transformerdecoder import PositionEmbeddingLearned, TransformerDecoderLayer, MultiheadAttention, PositionEmbeddingLearnedwoNorm
from .ffn import FFN, FFNLN
from .projection import ProjectionLayerNorm
from .sparsefusion_models import PointTransformer2D_3D, FusionTransformer2D_3D_Self, ImageTransformer_Cam_3D_MS, ViewTransformer
from .drop import Dropout, DropPath, build_dropout
from .deformable_decoder import DeformableTransformerDecoderLayer
from .depth_encoder import DepthEncoderResNet
from .network_modules import LayerNorm, ConvLN, denormalize_pos, normalize_pos

# Public names re-exported by this package.
# NOTE: every entry needs its own trailing comma. Adjacent string literals
# are concatenated by Python, which previously merged
# 'DeformableTransformerDecoderLayer' and 'ImageTransformer_Cam_3D_MS' into a
# single non-existent name and broke ``from mmdet3d.models.utils import *``.
__all__ = ['clip_sigmoid', "MLP", 'PositionEmbeddingLearned', 'TransformerDecoderLayer', 'MultiheadAttention',
           'FFN', 'inverse_sigmoid', 'PointTransformer2D_3D', 'FFNLN', 'PositionEmbeddingLearnedwoNorm',
           'ProjectionLayerNorm', 'FusionTransformer2D_3D_Self', 'Dropout', 'DropPath', 'build_dropout',
           'DeformableTransformerDecoderLayer', 'ImageTransformer_Cam_3D_MS', 'ViewTransformer',
           'DepthEncoderResNet', 'LayerNorm', 'ConvLN', "normalize_pos", "denormalize_pos"
           ]
class DeformableTransformerDecoderLayer(nn.Module):
    """Transformer decoder layer whose cross-attention is multi-scale
    deformable attention (``MSDeformAttn``).

    The layer runs, in order: optional self-attention over the queries,
    deformable cross-attention from the queries to ``key``, and a two-layer
    feed-forward network. Each step is followed by a residual connection and
    a ``LayerNorm``.

    Args:
        d_model (int): Feature dimension of queries and keys.
        nhead (int): Number of attention heads.
        level_num (int): Number of feature levels given to ``MSDeformAttn``.
        dim_feedforward (int): Hidden dimension of the feed-forward network.
        dropout (float): Dropout probability used throughout the layer.
        activation (str): One of ``"relu"``, ``"gelu"`` or ``"glu"``.
        self_posembed (nn.Module, optional): Module that embeds ``query_pos``.
        cross_posembed (nn.Module, optional): Module that embeds ``key_pos``.
        cross_only (bool): If True, the self-attention step is skipped and
            no self-attention module is created.
        n_points (int): Number of sampling points given to ``MSDeformAttn``.
    """

    def __init__(self, d_model, nhead, level_num=4, dim_feedforward=2048, dropout=0.1, activation="relu",
                 self_posembed=None, cross_posembed=None, cross_only=False, n_points=4):
        super().__init__()
        self.cross_only = cross_only
        if not self.cross_only:
            self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = MSDeformAttn(d_model, level_num, nhead, n_points)

        # Feed-forward network
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        # One norm / dropout pair per residual branch
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        # Resolve the activation name to the matching functional op.
        if activation in ("relu", "gelu", "glu"):
            self.activation = getattr(F, activation)
        else:
            raise RuntimeError(f"activation should be relu/gelu, not {activation}.")

        self.self_posembed = self_posembed
        self.cross_posembed = cross_posembed

    def with_pos_embed(self, tensor, pos_embed):
        """Add ``pos_embed`` to ``tensor``; return ``tensor`` unchanged when
        ``pos_embed`` is None."""
        if pos_embed is None:
            return tensor
        return tensor + pos_embed

    def forward(self, query, key, query_pos, key_pos, reference_points, level_start_index, spatial_shapes,
                query_padding_mask=None, input_padding_mask=None):
        """
        :param query: B C Pq
        :param key: B C Pk
        :param query_pos: B Pq 3/6
        :param key_pos: B Pk 3/6
        :param reference_points: forwarded unchanged to ``MSDeformAttn``
        :param level_start_index: forwarded to ``MSDeformAttn`` as ``input_level_start_index``
        :param spatial_shapes: forwarded to ``MSDeformAttn`` as ``input_spatial_shapes``
        :param query_padding_mask: key padding mask for the self-attention step
        :param input_padding_mask: padding mask forwarded to ``MSDeformAttn``
        :return: updated query, B C Pq
        """
        # Positional embeddings are brought to PxNxC, like the features below.
        query_pos_embed = (
            self.self_posembed(query_pos).permute(2, 0, 1)
            if self.self_posembed is not None else None)
        key_pos_embed = (
            self.cross_posembed(key_pos).permute(2, 0, 1)
            if self.cross_posembed is not None else None)

        # NxCxP to PxNxC
        query = query.permute(2, 0, 1)
        key = key.permute(2, 0, 1)

        if not self.cross_only:
            # Self-attention: the same position-augmented tensor acts as
            # query, key and value.
            attn_input = self.with_pos_embed(query, query_pos_embed)
            self_out = self.self_attn(
                attn_input, attn_input, value=attn_input,
                key_padding_mask=query_padding_mask)[0]
            query = self.norm1(query + self.dropout1(self_out))

        # Deformable cross-attention works on NxPxC tensors.
        cross_query = self.with_pos_embed(query, query_pos_embed)
        cross_input = self.with_pos_embed(key, key_pos_embed)
        cross_out = self.multihead_attn(
            query=cross_query.permute(1, 0, 2),
            input_flatten=cross_input.permute(1, 0, 2),
            reference_points=reference_points,
            input_spatial_shapes=spatial_shapes,
            input_level_start_index=level_start_index,
            input_padding_mask=input_padding_mask).permute(1, 0, 2)
        query = self.norm2(query + self.dropout2(cross_out))

        # Feed-forward network
        ffn_out = self.linear2(self.dropout(self.activation(self.linear1(query))))
        query = self.norm3(query + self.dropout3(ffn_out))

        # PxNxC back to NxCxP
        return query.permute(1, 2, 0)
class DepthEncoderResNet(nn.Module):
    """Multi-stage depth encoder that fuses image features into a depth
    feature map at every stage.

    A 3x3 conv + BN + ReLU stem embeds the depth input. Each stage then
    applies a stack of ``BasicBlock`` (stride 1 for the first stage, stride 2
    for the others), concatenates the image feature of that stage along the
    channel axis and mixes the result with a 3x3 conv.

    Args:
        input_channel (int): Channels of the depth input.
        input_channel_img (int): Channels of every image feature map.
        hidden_channel (int): Channels of the depth features.
        depth_layers (Sequence[int]): Number of blocks in each stage; its
            length is the number of stages.
    """

    def __init__(self, input_channel, input_channel_img, hidden_channel, depth_layers):
        super().__init__()
        self.depth_layers = depth_layers

        self.conv_depth = nn.Sequential(
            nn.Conv2d(input_channel, hidden_channel, kernel_size=3, padding=1, bias=True),
            nn.BatchNorm2d(hidden_channel),
            nn.ReLU(inplace=True)
        )

        # State read and updated by ``_make_layer``.
        self.inplanes = hidden_channel
        self._norm_layer = nn.BatchNorm2d

        self.layers = nn.ModuleList()
        self.fuse_layers = nn.ModuleList()
        self.output_layers = nn.ModuleList()

        for stage_idx, num_blocks in enumerate(depth_layers):
            # Only the first stage keeps the input resolution.
            stage_stride = 1 if stage_idx == 0 else 2
            self.layers.append(
                self._make_layer(BasicBlock, hidden_channel, num_blocks, stride=stage_stride))
            self.fuse_layers.append(
                nn.Conv2d(input_channel_img+hidden_channel, hidden_channel, kernel_size=3, padding=1))

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage made of ``blocks`` instances of ``block``.

        A 1x1 conv + norm shortcut is created for the first block when the
        stride is not 1 or the channel count changes.
        """
        out_planes = planes * block.expansion

        shortcut = None
        if stride != 1 or self.inplanes != out_planes:
            shortcut = nn.Sequential(
                nn.Conv2d(self.inplanes, out_planes, kernel_size=1, stride=stride),
                self._norm_layer(out_planes),
            )

        stage = [block(self.inplanes, planes, stride=stride, downsample=shortcut)]
        self.inplanes = planes * block.expansion
        stage.extend(block(self.inplanes, planes) for _ in range(1, blocks))
        return nn.Sequential(*stage)

    def forward(self, sparse_depth, img_inputs):
        """Encode the depth input and fuse it with the image features.

        Args:
            sparse_depth (torch.Tensor): Depth input fed to the stem conv.
            img_inputs (Sequence[torch.Tensor]): One image feature map per
                stage; each is concatenated with the depth feature of that
                stage along dim 1.

        Returns:
            list[torch.Tensor]: Fused feature map of every stage.
        """
        depth = self.conv_depth(sparse_depth)

        fused_outputs = []
        for stage_idx, img_feat in enumerate(img_inputs):
            depth = self.layers[stage_idx](depth)
            depth = self.fuse_layers[stage_idx](torch.cat([depth, img_feat], dim=1))
            # Store a copy; ``depth`` keeps flowing into the next stage.
            fused_outputs.append(depth.clone())
        return fused_outputs
def drop_path(x: torch.Tensor,
              drop_prob: float = 0.,
              training: bool = False) -> torch.Tensor:
    """Drop paths (Stochastic Depth) per sample (when applied in main path of
    residual blocks).

    We follow the implementation
    https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py  # noqa: E501

    Args:
        x (torch.Tensor): Input whose first dimension indexes the samples.
        drop_prob (float): Probability of zeroing a sample. Default: 0.
        training (bool): The input is returned untouched unless this is True.

    Returns:
        torch.Tensor: ``x`` itself when nothing is dropped, otherwise a new
            tensor in which kept samples are scaled by ``1 / (1 - drop_prob)``
            and dropped samples are zero.
    """
    if not training or drop_prob == 0.:
        return x

    survival_prob = 1 - drop_prob
    # One random draw per sample, broadcast over all remaining dimensions
    # (works for tensors of any rank, not just 4D).
    mask_shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
    # floor(survival_prob + U[0, 1)) is 1 with probability survival_prob.
    keep_mask = (survival_prob + torch.rand(
        mask_shape, dtype=x.dtype, device=x.device)).floor()
    return x.div(survival_prob) * keep_mask
def build_dropout(cfg: Dict, default_args: Optional[Dict] = None) -> Any:
    """Builder for drop out layers.

    Args:
        cfg (dict): Config of the layer; resolved against the
            ``DROPOUT_LAYERS`` registry.
        default_args (dict, optional): Default arguments forwarded to
            ``build_from_cfg``.

    Returns:
        Any: Whatever ``build_from_cfg`` builds for ``cfg``.
    """
    layer = build_from_cfg(cfg, DROPOUT_LAYERS, default_args)
    return layer
class FFNLN(nn.Module):
    """Set of per-task prediction heads built from ``nn.Linear`` layers.

    Every entry of ``heads`` becomes an ``nn.Sequential`` registered under
    the head name. A head consists of ``num_conv - 1`` hidden blocks
    (Linear, optional LayerNorm, ReLU) followed by a final Linear that maps
    ``head_conv`` channels to the number of outputs.

    Args:
        in_channels (int): Channels of the input features.
        heads (dict): Maps a head name to ``(classes, num_conv)`` or
            ``(classes, num_conv, need_norm)``. ``need_norm`` defaults to
            True for two-element entries.
        head_conv (int): Channels of the hidden layers. Default: 64.
        init_bias (float): Bias written to the last layer of heads whose
            name contains ``'heatmap'`` or ``'cls'``. Default: -2.19.
    """

    def __init__(self, in_channels, heads, head_conv=64, init_bias=-2.19, **kwargs):
        super(FFNLN, self).__init__()

        self.heads = heads
        self.init_bias = init_bias
        for head_name, head_spec in self.heads.items():
            if len(head_spec) == 2:
                num_outputs, num_layers = head_spec
                use_norm = True
            else:
                num_outputs, num_layers, use_norm = head_spec

            layers = []
            width = in_channels
            for _ in range(num_layers - 1):
                # The Linear bias is dropped only when a LayerNorm follows.
                layers.append(nn.Linear(width, head_conv, bias=not use_norm))
                if use_norm:
                    layers.append(nn.LayerNorm(head_conv))
                layers.append(nn.ReLU(inplace=True))
                width = head_conv

            # The output layer always reads ``head_conv`` channels.
            layers.append(nn.Linear(head_conv, num_outputs, bias=True))
            setattr(self, head_name, nn.Sequential(*layers))

    def init_weights(self):
        """Initialize weights.

        Heads whose name contains ``'heatmap'`` or ``'cls'`` only get the
        bias of their last layer filled with ``init_bias``; every ``Linear``
        of the other heads goes through ``kaiming_init``.
        """
        for head_name in self.heads:
            branch = getattr(self, head_name)
            if 'heatmap' in head_name or 'cls' in head_name:
                branch[-1].bias.data.fill_(self.init_bias)
                continue
            for module in branch.modules():
                if isinstance(module, nn.Linear):
                    kaiming_init(module)

    def forward(self, x):
        """Run every head on the input features.

        Args:
            x (torch.Tensor): Input features with the shape of [B, C, N].
                The channel axis is moved last before the linear layers
                are applied.

        Returns:
            dict[str: torch.Tensor]: One entry per key of ``heads``, each
                with the shape of [B, classes, N].
        """
        # [B, C, N] -> [B, N, C] so that ``nn.Linear`` acts on the channels.
        tokens = x.permute(0, 2, 1).contiguous()

        outputs = dict()
        for head_name in self.heads:
            prediction = getattr(self, head_name)(tokens)
            # Back to channel-first layout.
            outputs[head_name] = prediction.permute(0, 2, 1).contiguous()
        return outputs
def inverse_sigmoid(x, eps=1e-5):
    """Numerically safe inverse of the sigmoid function (logit).

    Args:
        x (torch.Tensor): Input values; they are clamped to [0, 1] first.
        eps (float): Lower bound applied to both ``x`` and ``1 - x`` so that
            neither the ratio nor the logarithm blows up. Default: 1e-5.

    Returns:
        torch.Tensor: ``log(x / (1 - x))`` computed on the clamped values.
    """
    prob = x.clamp(min=0, max=1)
    numerator = prob.clamp(min=eps)
    denominator = (1 - prob).clamp(min=eps)
    return torch.log(numerator / denominator)
class LayerNorm(nn.Module):
    r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
    shape (batch_size, height, width, channels) while channels_first corresponds to inputs
    with shape (batch_size, channels, height, width).

    Args:
        normalized_shape (int): Number of channels to normalize over.
        eps (float): Value added to the variance for numerical stability.
        data_format (str): ``"channels_last"`` or ``"channels_first"``; any
            other value raises ``NotImplementedError``.
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        # Learnable per-channel scale and shift.
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        if self.data_format == "channels_first":
            # Normalize over dim 1 by hand: F.layer_norm only handles
            # trailing dimensions.
            mean = x.mean(1, keepdim=True)
            var = (x - mean).pow(2).mean(1, keepdim=True)
            normed = (x - mean) / torch.sqrt(var + self.eps)
            return self.weight[:, None, None] * normed + self.bias[:, None, None]
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------

from .ms_deform_attn_func import MSDeformAttnFunction

__all__ = ['MSDeformAttnFunction']


================================================
FILE: mmdet3d/models/utils/ops/functions/ms_deform_attn_func.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------

from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import torch
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable

import MultiScaleDeformableAttention as MSDA


class MSDeformAttnFunction(Function):
    """Autograd function that forwards to the compiled ``MSDA`` extension.

    ``forward`` calls ``MSDA.ms_deform_attn_forward`` and ``backward`` calls
    ``MSDA.ms_deform_attn_backward``; both receive the same ``im2col_step``.
    """

    @staticmethod
    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
        # im2col_step is a plain value, so it is kept on ctx rather than saved as a tensor.
        ctx.im2col_step = im2col_step
        output = MSDA.ms_deform_attn_forward(
            value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
        ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
        grad_value, grad_sampling_loc, grad_attn_weight = \
            MSDA.ms_deform_attn_backward(
                value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)

        # One entry per forward() input: no gradient for the spatial shapes,
        # the level start indices, or im2col_step.
        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None


def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
    """Pure-PyTorch multi-scale deformable attention built on ``F.grid_sample``.

    Shapes, as unpacked and reshaped below:
        value: (N_, S_, M_, D_), where S_ is the sum of H_*W_ over all levels.
        value_spatial_shapes: iterable of (H_, W_) pairs, one per level.
        sampling_locations: (N_, Lq_, M_, L_, P_, 2); mapped from [0, 1] to
            the [-1, 1] range that ``grid_sample`` expects.
        attention_weights: (N_, Lq_, M_, L_, P_).

    Returns:
        Tensor of shape (N_, Lq_, M_*D_), contiguous.
    """
    # for debug and test only,
    # need to use cuda version instead
    N_, S_, M_, D_ = value.shape
    _, Lq_, M_, L_, P_, _ = sampling_locations.shape
    # Split the flattened spatial axis back into one chunk per level.
    value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
    sampling_grids = 2 * sampling_locations - 1
    sampling_value_list = []
    for lid_, (H_, W_) in enumerate(value_spatial_shapes):
        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
        value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
        sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
        # N_*M_, D_, Lq_, P_
        sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
                                          mode='bilinear', padding_mode='zeros', align_corners=False)
        sampling_value_list.append(sampling_value_l_)
    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_)
    attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
    # Stack levels, weight every sampled value, and sum over the L_*P_ samples.
    output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
    return output.transpose(1, 2).contiguous()


================================================
FILE: mmdet3d/models/utils/ops/make.sh
================================================
#!/usr/bin/env bash
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ python setup.py build install ================================================ FILE: mmdet3d/models/utils/ops/modules/__init__.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ from .ms_deform_attn import MSDeformAttn ================================================ FILE: mmdet3d/models/utils/ops/modules/ms_deform_attn.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. 
# Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ from __future__ import absolute_import from __future__ import print_function from __future__ import division import warnings import math import torch from torch import nn import torch.nn.functional as F from torch.nn.init import xavier_uniform_, constant_ from ..functions import MSDeformAttnFunction def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) return (n & (n-1) == 0) and n != 0 class MSDeformAttn(nn.Module): def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): """ Multi-Scale Deformable Attention Module :param d_model hidden dimension :param n_levels number of feature levels :param n_heads number of attention heads :param n_points number of sampling points per attention head per feature level """ super().__init__() if d_model % n_heads != 0: raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) _d_per_head = d_model // n_heads # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation if not _is_power_of_2(_d_per_head): warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " "which is more efficient in our CUDA implementation.") self.im2col_step = 64 self.d_model = d_model self.n_levels = n_levels self.n_heads = n_heads self.n_points = n_points self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) self.value_proj = nn.Linear(d_model, d_model) 
self.output_proj = nn.Linear(d_model, d_model) self._reset_parameters() def _reset_parameters(self): constant_(self.sampling_offsets.weight.data, 0.) thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) for i in range(self.n_points): grid_init[:, :, i, :] *= i + 1 with torch.no_grad(): self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) constant_(self.attention_weights.weight.data, 0.) constant_(self.attention_weights.bias.data, 0.) xavier_uniform_(self.value_proj.weight.data) constant_(self.value_proj.bias.data, 0.) xavier_uniform_(self.output_proj.weight.data) constant_(self.output_proj.bias.data, 0.) def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): """ :param query (N, Length_{query}, C) :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements :return output (N, Length_{query}, C) """ N, Len_q, _ = query.shape N, Len_in, _ = input_flatten.shape assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in value = self.value_proj(input_flatten) if input_padding_mask is not None: value = value.masked_fill(input_padding_mask[..., None], float(0)) value = 
value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) # N, Len_q, n_heads, n_levels, n_points, 2 if reference_points.shape[-1] == 2: offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) sampling_locations = reference_points[:, :, None, :, None, :] \ + sampling_offsets / offset_normalizer[None, None, None, :, None, :] elif reference_points.shape[-1] == 4: sampling_locations = reference_points[:, :, None, :, None, :2] \ + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 else: raise ValueError( 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) output = MSDeformAttnFunction.apply( value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) output = self.output_proj(output) return output ================================================ FILE: mmdet3d/models/utils/ops/setup.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. 
# Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ import os import glob import torch from torch.utils.cpp_extension import CUDA_HOME from torch.utils.cpp_extension import CppExtension from torch.utils.cpp_extension import CUDAExtension from setuptools import find_packages from setuptools import setup requirements = ["torch", "torchvision"] def get_extensions(): this_dir = os.path.dirname(os.path.abspath(__file__)) extensions_dir = os.path.join(this_dir, "src") main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) sources = main_file + source_cpu extension = CppExtension extra_compile_args = {"cxx": []} define_macros = [] if torch.cuda.is_available() and CUDA_HOME is not None: extension = CUDAExtension sources += source_cuda define_macros += [("WITH_CUDA", None)] extra_compile_args["nvcc"] = [ "-DCUDA_HAS_FP16=1", "-D__CUDA_NO_HALF_OPERATORS__", "-D__CUDA_NO_HALF_CONVERSIONS__", "-D__CUDA_NO_HALF2_OPERATORS__", ] else: raise NotImplementedError('Cuda is not availabel') sources = [os.path.join(extensions_dir, s) for s in sources] include_dirs = [extensions_dir] ext_modules = [ extension( "MultiScaleDeformableAttention", sources, include_dirs=include_dirs, define_macros=define_macros, extra_compile_args=extra_compile_args, ) ] return ext_modules setup( name="MultiScaleDeformableAttention", version="1.0", author="Weijie Su", url="https://github.com/fundamentalvision/Deformable-DETR", description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", packages=find_packages(exclude=("configs", 
"tests",)), ext_modules=get_extensions(), cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, ) ================================================ FILE: mmdet3d/models/utils/ops/src/cpu/ms_deform_attn_cpu.cpp ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. * Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ #include #include #include at::Tensor ms_deform_attn_cpu_forward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const int im2col_step) { AT_ERROR("Not implement on cpu"); } std::vector ms_deform_attn_cpu_backward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const at::Tensor &grad_output, const int im2col_step) { AT_ERROR("Not implement on cpu"); } ================================================ FILE: mmdet3d/models/utils/ops/src/cpu/ms_deform_attn_cpu.h ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. 
* Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ #pragma once #include at::Tensor ms_deform_attn_cpu_forward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const int im2col_step); std::vector ms_deform_attn_cpu_backward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const at::Tensor &grad_output, const int im2col_step); ================================================ FILE: mmdet3d/models/utils/ops/src/cuda/ms_deform_attn_cuda.cu ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. 
* Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ #include #include "cuda/ms_deform_im2col_cuda.cuh" #include #include #include #include at::Tensor ms_deform_attn_cuda_forward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const int im2col_step) { AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); const int batch = value.size(0); const int spatial_size = value.size(1); const int num_heads = value.size(2); const int channels = value.size(3); const int num_levels = spatial_shapes.size(0); const int num_query = sampling_loc.size(1); const int num_point = sampling_loc.size(4); const int im2col_step_ = std::min(batch, im2col_step); AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); auto output = at::zeros({batch, num_query, 
num_heads, channels}, value.options()); const int batch_n = im2col_step_; auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); auto per_value_size = spatial_size * num_heads * channels; auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; for (int n = 0; n < batch/im2col_step_; ++n) { auto columns = output_n.select(0, n); AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), value.data() + n * im2col_step_ * per_value_size, spatial_shapes.data(), level_start_index.data(), sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, attn_weight.data() + n * im2col_step_ * per_attn_weight_size, batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, columns.data()); })); } output = output.view({batch, num_query, num_heads*channels}); return output; } std::vector ms_deform_attn_cuda_backward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const at::Tensor &grad_output, const int im2col_step) { AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must 
be a CUDA tensor"); AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); const int batch = value.size(0); const int spatial_size = value.size(1); const int num_heads = value.size(2); const int channels = value.size(3); const int num_levels = spatial_shapes.size(0); const int num_query = sampling_loc.size(1); const int num_point = sampling_loc.size(4); const int im2col_step_ = std::min(batch, im2col_step); AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); auto grad_value = at::zeros_like(value); auto grad_sampling_loc = at::zeros_like(sampling_loc); auto grad_attn_weight = at::zeros_like(attn_weight); const int batch_n = im2col_step_; auto per_value_size = spatial_size * num_heads * channels; auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); for (int n = 0; n < batch/im2col_step_; ++n) { auto grad_output_g = grad_output_n.select(0, n); AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), grad_output_g.data(), value.data() + n * im2col_step_ * per_value_size, spatial_shapes.data(), level_start_index.data(), sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, attn_weight.data() + n * im2col_step_ * per_attn_weight_size, batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value.data() + n * im2col_step_ * per_value_size, grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size); })); } return { grad_value, grad_sampling_loc, 
grad_attn_weight }; } ================================================ FILE: mmdet3d/models/utils/ops/src/cuda/ms_deform_attn_cuda.h ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. * Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ #pragma once #include at::Tensor ms_deform_attn_cuda_forward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const int im2col_step); std::vector ms_deform_attn_cuda_backward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const at::Tensor &grad_output, const int im2col_step); ================================================ FILE: mmdet3d/models/utils/ops/src/cuda/ms_deform_im2col_cuda.cuh ================================================ /*! ************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. 
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************
* Modified from DCN (https://github.com/msracver/Deformable-ConvNets)
* Copyright (c) 2018 Microsoft
**************************************************************************
*/

// NOTE(review): the <...> include targets and the template parameter lists
// were missing in this copy of the file. They are restored here from what the
// code uses (scalar_t, blockSize, atomicAdd on scalar_t) -- confirm against
// the build, in particular the THC atomics header for the PyTorch version in use.
#include <cstdio>
#include <algorithm>
#include <cstring>

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

#include <THC/THCAtomics.cuh>

// Grid-stride loop: thread t handles indices t, t + blockDim*gridDim, ...
// until n is reached.
#define CUDA_KERNEL_LOOP(i, n)                          \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x;   \
      i < (n);                                          \
      i += blockDim.x * gridDim.x)

const int CUDA_NUM_THREADS = 1024;

// Number of blocks of num_threads threads needed to cover N items (ceil division).
inline int GET_BLOCKS(const int N, const int num_threads)
{
  return (N + num_threads - 1) / num_threads;
}


// Bilinearly interpolates one channel of one head at the fractional pixel
// position (h, w).
//
// bottom_data is indexed as ((y * width + x) * nheads + m) * channels + c.
// Any of the four neighbouring pixels that falls outside the
// [0, height-1] x [0, width-1] range contributes 0.
template <typename scalar_t>
__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data,
                                                   const int &height, const int &width, const int &nheads, const int &channels,
                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c)
{
  const int h_low = floor(h);
  const int w_low = floor(w);
  const int h_high = h_low + 1;
  const int w_high = w_low + 1;

  // Fractional parts (lh, lw) and their complements (hh, hw).
  const scalar_t lh = h - h_low;
  const scalar_t lw = w - w_low;
  const scalar_t hh = 1 - lh, hw = 1 - lw;

  const int w_stride = nheads * channels;
  const int h_stride = width * w_stride;
  const int h_low_ptr_offset = h_low * h_stride;
  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
  const int w_low_ptr_offset = w_low * w_stride;
  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
  const int base_ptr = m * channels + c;

  scalar_t v1 = 0;
  if (h_low >= 0 && w_low >= 0)
  {
    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
    v1 = bottom_data[ptr1];
  }
  scalar_t v2 = 0;
  if (h_low >= 0 && w_high <= width - 1)
  {
    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
    v2 = bottom_data[ptr2];
  }
  scalar_t v3 = 0;
  if (h_high <= height - 1 && w_low >= 0)
  {
    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
    v3 = bottom_data[ptr3];
  }
  scalar_t v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1)
  {
    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
    v4 = bottom_data[ptr4];
  }

  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;

  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  return val;
}


// Backward of the bilinear sample at (h, w) for one head/channel.
//
// Adds w_k * top_grad * attn_weight into grad_value for each in-range corner
// (with atomicAdd) and OVERWRITES the three outputs:
//   *grad_attn_weight       = top_grad * interpolated value
//   grad_sampling_loc[0]    = width  * d(val)/dw * top_grad * attn_weight
//   grad_sampling_loc[1]    = height * d(val)/dh * top_grad * attn_weight
// The plain stores mean the caller must own the destination slots.
template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
                                               const int &height, const int &width, const int &nheads, const int &channels,
                                               const scalar_t &h, const scalar_t &w, const int &m, const int &c,
                                               const scalar_t &top_grad,
                                               const scalar_t &attn_weight,
                                               scalar_t* &grad_value,
                                               scalar_t* grad_sampling_loc,
                                               scalar_t* grad_attn_weight)
{
  const int h_low = floor(h);
  const int w_low = floor(w);
  const int h_high = h_low + 1;
  const int w_high = w_low + 1;

  const scalar_t lh = h - h_low;
  const scalar_t lw = w - w_low;
  const scalar_t hh = 1 - lh, hw = 1 - lw;

  const int w_stride = nheads * channels;
  const int h_stride = width * w_stride;
  const int h_low_ptr_offset = h_low * h_stride;
  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
  const int w_low_ptr_offset = w_low * w_stride;
  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
  const int base_ptr = m * channels + c;

  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
  const scalar_t top_grad_value = top_grad * attn_weight;
  scalar_t grad_h_weight = 0, grad_w_weight = 0;

  scalar_t v1 = 0;
  if (h_low >= 0 && w_low >= 0)
  {
    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
    v1 = bottom_data[ptr1];
    grad_h_weight -= hw * v1;
    grad_w_weight -= hh * v1;
    atomicAdd(grad_value+ptr1, w1*top_grad_value);
  }
  scalar_t v2 = 0;
  if (h_low >= 0 && w_high <= width - 1)
  {
    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
    v2 = bottom_data[ptr2];
    grad_h_weight -= lw * v2;
    grad_w_weight += hh * v2;
    atomicAdd(grad_value+ptr2, w2*top_grad_value);
  }
  scalar_t v3 = 0;
  if (h_high <= height - 1 && w_low >= 0)
  {
    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
    v3 = bottom_data[ptr3];
    grad_h_weight += hw * v3;
    grad_w_weight -= lh * v3;
    atomicAdd(grad_value+ptr3, w3*top_grad_value);
  }
  scalar_t v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1)
  {
    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
    v4 = bottom_data[ptr4];
    grad_h_weight += lw * v4;
    grad_w_weight += lh * v4;
    atomicAdd(grad_value+ptr4, w4*top_grad_value);
  }

  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  *grad_attn_weight = top_grad * val;
  *grad_sampling_loc = width * grad_w_weight * top_grad_value;
  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
}


// Same computation as ms_deform_attn_col2im_bilinear, but the three outputs
// are ACCUMULATED with atomicAdd instead of being overwritten, so several
// threads may target the same grad_sampling_loc / grad_attn_weight slots.
template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
                                                  const int &height, const int &width, const int &nheads, const int &channels,
                                                  const scalar_t &h, const scalar_t &w, const int &m, const int &c,
                                                  const scalar_t &top_grad,
                                                  const scalar_t &attn_weight,
                                                  scalar_t* &grad_value,
                                                  scalar_t* grad_sampling_loc,
                                                  scalar_t* grad_attn_weight)
{
  const int h_low = floor(h);
  const int w_low = floor(w);
  const int h_high = h_low + 1;
  const int w_high = w_low + 1;

  const scalar_t lh = h - h_low;
  const scalar_t lw = w - w_low;
  const scalar_t hh = 1 - lh, hw = 1 - lw;

  const int w_stride = nheads * channels;
  const int h_stride = width * w_stride;
  const int h_low_ptr_offset = h_low * h_stride;
  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
  const int w_low_ptr_offset = w_low * w_stride;
  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
  const int base_ptr = m * channels + c;

  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
  const scalar_t top_grad_value = top_grad * attn_weight;
  scalar_t grad_h_weight = 0, grad_w_weight = 0;

  scalar_t v1 = 0;
  if (h_low >= 0 && w_low >= 0)
  {
    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
    v1 = bottom_data[ptr1];
    grad_h_weight -= hw * v1;
    grad_w_weight -= hh * v1;
    atomicAdd(grad_value+ptr1, w1*top_grad_value);
  }
  scalar_t v2 = 0;
  if (h_low >= 0 && w_high <= width - 1)
  {
    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
    v2 = bottom_data[ptr2];
    grad_h_weight -= lw * v2;
    grad_w_weight += hh * v2;
    atomicAdd(grad_value+ptr2, w2*top_grad_value);
  }
  scalar_t v3 = 0;
  if (h_high <= height - 1 && w_low >= 0)
  {
    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
    v3 = bottom_data[ptr3];
    grad_h_weight += hw * v3;
    grad_w_weight -= lh * v3;
    atomicAdd(grad_value+ptr3, w3*top_grad_value);
  }
  scalar_t v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1)
  {
    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
    v4 = bottom_data[ptr4];
    grad_h_weight += lw * v4;
    grad_w_weight += lh * v4;
    atomicAdd(grad_value+ptr4, w4*top_grad_value);
  }

  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  atomicAdd(grad_attn_weight, top_grad * val);
  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
}


// Forward kernel: one loop iteration per output element.
//
// index decodes (innermost first) into channel c_col, head m_col, query q_col
// and batch b_col. For that element the kernel walks all num_levels levels and
// num_point points, converts each normalized location to pixel coordinates
// (loc * size - 0.5), bilinearly samples the value map, and accumulates the
// samples weighted by the attention weight into data_col[index].
// Locations outside (-1, size) in either axis are skipped.
template <typename scalar_t>
__global__ void ms_deformable_im2col_gpu_kernel(const int n,
                                                const scalar_t *data_value,
                                                const int64_t *data_spatial_shapes,
                                                const int64_t *data_level_start_index,
                                                const scalar_t *data_sampling_loc,
                                                const scalar_t *data_attn_weight,
                                                const int batch_size,
                                                const int spatial_size,
                                                const int num_heads,
                                                const int channels,
                                                const int num_levels,
                                                const int num_query,
                                                const int num_point,
                                                scalar_t *data_col)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    // (b, q, m) flattened: selects this element's block of weights/locations.
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    const int q_col = _temp % num_query;
    _temp /= num_query;
    const int b_col = _temp;

    scalar_t *data_col_ptr = data_col + index;
    int data_weight_ptr = sampling_index * num_levels * num_point;
    // Locations store two values (w, h) per weight, hence the factor of 2.
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
    scalar_t col = 0;

    for (int l_col=0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      // spatial shapes are stored as (h, w) pairs per level.
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);
      for (int p_col=0; p_col < num_point; ++p_col)
      {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;

        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
        {
          col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
        }

        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
      }
    }
    *data_col_ptr = col;
  }
}

template <typename scalar_t, unsigned int blockSize>
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n,
                                                const scalar_t *grad_col,
                                                const scalar_t *data_value,
                                                const int64_t *data_spatial_shapes,
                                                const int64_t *data_level_start_index,
                                                const scalar_t *data_sampling_loc,
                                                const scalar_t *data_attn_weight,
                                                const int batch_size,
                                                const int spatial_size,
                                                const int num_heads,
                                                const int channels,
                                                const int num_levels,
                                                const int num_query,
                                                const int num_point,
                                                scalar_t *grad_value,
                                                scalar_t *grad_sampling_loc,
                                                scalar_t *grad_attn_weight)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
    __shared__ scalar_t cache_grad_attn_weight[blockSize];
    unsigned int tid = threadIdx.x;
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    const int q_col = _temp % num_query;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    grad_sampling_loc +=
grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); if (tid == 0) { scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; int sid=2; for (unsigned int tid = 1; tid < blockSize; ++tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; _grad_a += cache_grad_attn_weight[tid]; sid += 2; } *grad_sampling_loc = _grad_w; *(grad_sampling_loc + 1) = _grad_h; *grad_attn_weight = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 
2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; __shared__ scalar_t cache_grad_attn_weight[blockSize]; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; 
p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); for (unsigned int s=blockSize/2; s>0; s>>=1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; } __syncthreads(); } if (tid == 0) { *grad_sampling_loc = cache_grad_sampling_loc[0]; *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; *grad_attn_weight = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern 
__shared__ int _s[]; scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, 
h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); if (tid == 0) { scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; int sid=2; for (unsigned int tid = 1; tid < blockDim.x; ++tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; _grad_a += cache_grad_attn_weight[tid]; sid += 2; } *grad_sampling_loc = _grad_w; *(grad_sampling_loc + 1) = _grad_h; *grad_attn_weight = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern __shared__ int _s[]; scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; 
const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; 
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { *grad_sampling_loc = cache_grad_sampling_loc[0]; *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; *grad_attn_weight = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern __shared__ int _s[]; scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int 
spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); } 
__syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = 
data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear_gm( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, grad_sampling_loc, grad_attn_weight); } data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template void ms_deformable_im2col_cuda(cudaStream_t stream, const scalar_t* data_value, const int64_t* data_spatial_shapes, const int64_t* data_level_start_index, const scalar_t* data_sampling_loc, const scalar_t* data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t* data_col) { const int num_kernels = batch_size * num_query * num_heads * channels; const int num_actual_kernels = batch_size * num_query * num_heads * channels; const int num_threads = CUDA_NUM_THREADS; ms_deformable_im2col_gpu_kernel <<>>( num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); } } template void ms_deformable_col2im_cuda(cudaStream_t stream, const scalar_t* grad_col, const scalar_t* data_value, const int64_t * data_spatial_shapes, const int64_t * data_level_start_index, const scalar_t * data_sampling_loc, const scalar_t * data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t* grad_value, 
scalar_t* grad_sampling_loc, scalar_t* grad_attn_weight) { const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels; const int num_kernels = batch_size * num_query * num_heads * channels; const int num_actual_kernels = batch_size * num_query * num_heads * channels; if (channels > 1024) { if ((channels & 1023) == 0) { ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); } else { ms_deformable_col2im_gpu_kernel_gm <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); } } else{ switch(channels) { case 1: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 2: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 4: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 8: 
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 16: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 32: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 64: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 128: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 256: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 512: 
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 1024: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; default: if (channels < 64) { ms_deformable_col2im_gpu_kernel_shm_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); } else { ms_deformable_col2im_gpu_kernel_shm_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); } } } cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); } } ================================================ FILE: mmdet3d/models/utils/ops/src/ms_deform_attn.h ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. 
* Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ #pragma once #include "cpu/ms_deform_attn_cpu.h" #ifdef WITH_CUDA #include "cuda/ms_deform_attn_cuda.h" #endif at::Tensor ms_deform_attn_forward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const int im2col_step) { if (value.type().is_cuda()) { #ifdef WITH_CUDA return ms_deform_attn_cuda_forward( value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } std::vector ms_deform_attn_backward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const at::Tensor &grad_output, const int im2col_step) { if (value.type().is_cuda()) { #ifdef WITH_CUDA return ms_deform_attn_cuda_backward( value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } ================================================ FILE: mmdet3d/models/utils/ops/src/vision.cpp ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. 
* Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ #include "ms_deform_attn.h" PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); } ================================================ FILE: mmdet3d/models/utils/ops/test.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ from __future__ import absolute_import from __future__ import print_function from __future__ import division import time import torch import torch.nn as nn from torch.autograd import gradcheck from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch N, M, D = 1, 2, 2 Lq, L, P = 2, 2, 2 shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) S = sum([(H*W).item() for H, W in shapes]) torch.manual_seed(3) @torch.no_grad() def check_forward_equal_with_pytorch_double(): value = torch.rand(N, S, M, D).cuda() * 0.01 sampling_locations = torch.rand(N, Lq, M, L, P, 
2).cuda() attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) im2col_step = 2 output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() fwdok = torch.allclose(output_cuda, output_pytorch) max_abs_err = (output_cuda - output_pytorch).abs().max() max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') @torch.no_grad() def check_forward_equal_with_pytorch_float(): value = torch.rand(N, S, M, D).cuda() * 0.01 sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) im2col_step = 2 output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) max_abs_err = (output_cuda - output_pytorch).abs().max() max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): value = torch.rand(N, S, M, channels).cuda() * 0.01 sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) im2col_step = 2 func = MSDeformAttnFunction.apply value.requires_grad = grad_value sampling_locations.requires_grad = grad_sampling_loc attention_weights.requires_grad = grad_attn_weight gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) print(f'* {gradok} check_gradient_numerical(D={channels})') if __name__ == '__main__': check_forward_equal_with_pytorch_double() check_forward_equal_with_pytorch_float() for channels in [30, 32, 64, 71, 1025, 2048, 3096]: check_gradient_numerical(channels, True, True, True) ================================================ FILE: mmdet3d/models/utils/projection.py ================================================ import torch import torch.nn as nn from mmdet3d.models.utils import PositionEmbeddingLearned class PointProjection(nn.Module): def __init__(self, pos_channel, hidden_channel): super(PointProjection, self).__init__() self.feat_proj = nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1) self.pos_embed = nn.Sequential( nn.Conv1d(pos_channel, hidden_channel*4, kernel_size=1), nn.ReLU(inplace=True), nn.Conv1d(hidden_channel*4, hidden_channel, kernel_size=1) ) self.fuse_proj = nn.Sequential( nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1), nn.ReLU(inplace=True), nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1) ) def forward(self, query_feat, query_pos): pos_embed = self.pos_embed(query_pos.permute(0, 2, 1)) feat_embed = self.feat_proj(query_feat) proj_embed = self.fuse_proj(feat_embed + pos_embed) return proj_embed class ImageProjection(nn.Module): def __init__(self, pos_channel, hidden_channel): super(ImageProjection, self).__init__() self.feat_proj = nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1) self.pos_proj = nn.Sequential( nn.Conv1d(pos_channel, hidden_channel*4, kernel_size=1), nn.ReLU(inplace=True), 
nn.Conv1d(hidden_channel*4, hidden_channel, kernel_size=1), ) self.fuse_proj = nn.Sequential( nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1), nn.ReLU(inplace=True), nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1) ) def forward(self, query_feat, query_pos): feat_embed = self.feat_proj(query_feat) pos_embed = self.pos_proj(query_pos.permute(0, 2, 1)) proj_embed = self.fuse_proj(feat_embed + pos_embed) return proj_embed class ProjectionL2Norm(nn.Module): def __init__(self, hidden_channel): super(ProjectionL2Norm, self).__init__() self.hidden_channel = hidden_channel self.feat_proj = nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1) def forward(self, query_feat): query_feat = self.feat_proj(query_feat) assert query_feat.shape[1] == self.hidden_channel query_feat = query_feat / torch.norm(query_feat, p=2, keepdim=True, dim=1) return query_feat class ProjectionLayerNorm(nn.Module): def __init__(self, hidden_channel, norm=True, input_channel=None): super(ProjectionLayerNorm, self).__init__() if input_channel is None: input_channel = hidden_channel self.hidden_channel = hidden_channel self.feat_proj = nn.Linear(input_channel, hidden_channel) self.norm = norm if norm: self.norm = nn.LayerNorm(hidden_channel) def forward(self, query_feat): query_feat = query_feat.transpose(2, 1) query_feat = self.feat_proj(query_feat) if self.norm: query_feat = self.norm(query_feat) query_feat = query_feat.transpose(2, 1) return query_feat class Projection_wPos(nn.Module): def __init__(self, hidden_channel, pos_embed): super(Projection_wPos, self).__init__() self.hidden_channel = hidden_channel self.pos_proj = pos_embed self.feat_proj = ProjectionLayerNorm(hidden_channel) def forward(self, query_feat, query_pos): feat_embed = self.feat_proj(query_feat) pos_embed = self.pos_proj(query_pos) return feat_embed + pos_embed ================================================ FILE: mmdet3d/models/utils/sparsefusion_models.py ================================================ 
import copy import numpy as np import torch from torch import nn import torch.nn.functional as F from mmdet3d.models.fusion_layers import apply_3d_transformation from mmdet3d.models.utils import TransformerDecoderLayer, inverse_sigmoid from mmdet3d.models.utils.deformable_decoder import DeformableTransformerDecoderLayer from mmdet3d.models.utils.network_modules import LayerNorm, denormalize_pos, normalize_pos class PointTransformer2D_3D(nn.Module): def __init__(self, hidden_channel, num_heads, num_decoder_layers, prediction_heads, ffn_channel, dropout, activation, test_cfg, query_pos, key_pos): super(PointTransformer2D_3D, self).__init__() self.hidden_channel = hidden_channel self.num_heads = num_heads self.num_decoder_layers = num_decoder_layers self.prediction_heads = prediction_heads self.test_cfg = test_cfg self.decoder = nn.ModuleList() for i in range(self.num_decoder_layers): self.decoder.append( TransformerDecoderLayer( hidden_channel, num_heads, ffn_channel, dropout, activation, self_posembed=query_pos[i], cross_posembed=key_pos[i], ) ) def forward(self, pts_query_feat, pts_query_pos, lidar_feat_flatten, bev_pos): ret_dicts = [] res_layer = self.prediction_heads(pts_query_feat) res_layer['center'] = pts_query_pos.permute(0, 2, 1) # [BS, 2, num_proposals] for i in range(self.num_decoder_layers): # Transformer Decoder Layer # :param query: B C Pq :param query_pos: B Pq 3/6 pts_query_feat = self.decoder[i](pts_query_feat, lidar_feat_flatten, pts_query_pos, bev_pos) # Prediction res_layer = self.prediction_heads(pts_query_feat) res_layer['center'] = res_layer['center'] + pts_query_pos.permute(0, 2, 1) ret_dicts.append(res_layer) # for next level positional embedding pts_query_pos = res_layer['center'].detach().clone().permute(0, 2, 1) return pts_query_feat, pts_query_pos, ret_dicts class CameraSE(nn.Module): def __init__(self, cam_dim, hidden_channel): super(CameraSE, self).__init__() self.bn = nn.BatchNorm1d(cam_dim) self.hidden_channel = hidden_channel 
self.mlp_depth = nn.Sequential( nn.Conv1d(cam_dim, hidden_channel, kernel_size=1), nn.ReLU(inplace=True), nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1), nn.ReLU(inplace=True), nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1), ) def forward(self, feat, cam_info): cam_info_bn = self.bn(cam_info) pred = feat * self.mlp_depth(cam_info_bn).sigmoid() return pred class ImageTransformer_Cam_3D_MS(nn.Module): def __init__(self, num_views, hidden_channel, num_heads, num_decoder_layers, prediction_heads, out_size_factor_img, ffn_channel, dropout, activation, test_cfg, query_pos, key_pos): super(ImageTransformer_Cam_3D_MS, self).__init__() self.hidden_channel = hidden_channel self.num_heads = num_heads self.num_decoder_layers = num_decoder_layers self.prediction_heads = prediction_heads self.num_views = num_views self.out_size_factor_img = out_size_factor_img self.test_cfg = test_cfg # self.use_camera = use_camera self.decoder = nn.ModuleList() for i in range(self.num_decoder_layers): self.decoder.append( DeformableTransformerDecoderLayer( hidden_channel, num_heads, dim_feedforward=ffn_channel, dropout=dropout, activation=activation, self_posembed=query_pos[i], cross_posembed=key_pos[i], ) ) camera_dim = 16 # if use_camera == 'se': # self.camera_net = CameraSE(camera_dim, hidden_channel) def forward(self, img_query_feat, normal_img_query_pos, img_query_view, img_feats, normal_img_feats_pos_stack, lidar2cam_rt, cam_intrinsic, img_metas, input_padding_mask=None): num_img_proposals = img_query_feat.shape[-1] level_num = len(img_feats) batch_size = img_query_feat.shape[0] img_feats_flatten = [] level_start_index = [0] spatial_shapes = [] for lvl in range(level_num): img_feat = img_feats[lvl] h, w = img_feat.shape[-2], img_feat.shape[-1] img_feat_flatten = img_feat.view(batch_size, self.num_views, self.hidden_channel, h*w) # [bs, num_view, C, h*w] img_feats_flatten.append(img_feat_flatten) level_start_index.append(level_start_index[-1] + h*w) 
spatial_shapes.append([h, w]) level_start_index = level_start_index[:-1] level_start_index = torch.LongTensor(level_start_index).to(img_query_feat.device) spatial_shapes = torch.LongTensor(spatial_shapes).to(img_query_feat.device) img_feats_stack = torch.cat(img_feats_flatten, dim=3) # [bs, num_view, C, h*w (sum)] reference_points = normal_img_query_pos.sigmoid() # [bs, num_img_proposal, 2] reference_points = reference_points[:, :, None].repeat(1, 1, level_num, 1) camera_info = torch.zeros([batch_size, 16, num_img_proposals]).to(img_query_feat.device) camera_info[:, :9] = lidar2cam_rt[:, :, :3, :3].permute(0, 2, 3, 1).reshape(batch_size, 9, num_img_proposals) camera_info[:, 9:12] = lidar2cam_rt[:, :, :3, 3].permute(0, 2, 1) camera_info[:, 12] = cam_intrinsic[:, :, 0, 0] camera_info[:, 13] = cam_intrinsic[:, :, 1, 1] camera_info[:, 14:16] = cam_intrinsic[:, :, :2, 2].permute(0, 2, 1) ret_dicts = [] for i in range(self.num_decoder_layers): img_prev_query_feat = img_query_feat.clone() # [BS, C, num_proposals] img_query_feat = torch.zeros_like(img_query_feat) # create new container for img query feature for sample_idx in range(batch_size): bincount = torch.bincount(img_query_view[sample_idx], minlength=self.num_views) view_mask = bincount > 1 max_len = torch.max(bincount) sample_query_feats = torch.zeros([self.num_views, self.hidden_channel, max_len]).type_as(camera_info) samples_normal_query_pos = torch.zeros([self.num_views, max_len, 2]).type_as(camera_info) sample_reference_points = torch.zeros([self.num_views, max_len, level_num, 2]).type_as(camera_info) sample_padding_mask = torch.zeros([self.num_views, max_len], dtype=torch.bool, device=camera_info.device) for view_idx in range(self.num_views): on_the_image = img_query_view[sample_idx] == view_idx # [num_on_the_image, ] view_count = bincount[view_idx] if torch.sum(on_the_image) <= 1: continue sample_query_feats[view_idx, :, :view_count] = img_prev_query_feat[sample_idx, :, on_the_image] 
samples_normal_query_pos[view_idx, :view_count] = normal_img_query_pos[sample_idx, on_the_image] sample_reference_points[view_idx, :view_count] = reference_points[sample_idx, on_the_image] sample_padding_mask[view_idx, view_count:] = True if input_padding_mask is None: sample_query_feats[view_mask] = self.decoder[i]( sample_query_feats[view_mask], img_feats_stack[sample_idx, view_mask], samples_normal_query_pos[view_mask], normal_img_feats_pos_stack.repeat(view_mask.sum(), 1, 1), reference_points=sample_reference_points[view_mask], level_start_index=level_start_index, spatial_shapes=spatial_shapes, query_padding_mask=sample_padding_mask[view_mask] ) else: sample_query_feats[view_mask] = self.decoder[i]( sample_query_feats[view_mask], img_feats_stack[sample_idx, view_mask], samples_normal_query_pos[view_mask], normal_img_feats_pos_stack.repeat(view_mask.sum(), 1, 1), reference_points=sample_reference_points[view_mask], level_start_index=level_start_index, spatial_shapes=spatial_shapes, query_padding_mask=sample_padding_mask[view_mask], input_padding_mask=input_padding_mask[sample_idx,view_mask] ) for view_idx in range(self.num_views): on_the_image = img_query_view[sample_idx] == view_idx # [num_on_the_image, ] if torch.sum(on_the_image) <= 1: continue view_count = bincount[view_idx] img_query_feat[sample_idx, :, on_the_image] = sample_query_feats[view_idx, :, :view_count] res_layer = self.prediction_heads(img_query_feat) if 'center_img' in res_layer: res_layer['center_img'] = res_layer['center_img'] + normal_img_query_pos.permute(0, 2, 1) res_layer['center_img'] = res_layer['center_img'].sigmoid() res_layer['dim_img'] = res_layer['dim_img'].sigmoid() res_layer['center_2d'] = res_layer['center_2d'] + normal_img_query_pos.permute(0, 2, 1) normal_img_query_pos = res_layer['center_2d'].detach().clone().permute(0, 2, 1) res_layer['center_2d'] = res_layer['center_2d'].sigmoid() if batch_size > 1 or i == self.num_decoder_layers-1: # only when training center_2d = 
res_layer['center_2d'].clone().permute(0, 2, 1) # [bs, num_proposals, 2] depth = res_layer['depth_2d'].clone().permute(0, 2, 1)[..., :1] # [bs, num_proposals, 1] h, w = img_metas[0]['input_shape'][:2] center_pos = denormalize_pos(center_2d, w, h, sigmoid=False) # [bs, num_proposals, 2] center_pos = center_pos * depth camera_coords = torch.cat([center_pos, depth], dim=2) # [bs, num_proposals, 3] loc_cam_3d = torch.matmul(torch.inverse(cam_intrinsic[:, :, :3, :3]), camera_coords.unsqueeze(-1)).squeeze(-1) # [bs, num_proposals, 3] res_layer['loc_cam_3d'] = loc_cam_3d.permute(0, 2, 1) ret_dicts.append(res_layer) # img_query_feat = self.camera_net(img_query_feat, camera_info.clone()) loc_cam_3d = copy.deepcopy(ret_dicts[-1]['loc_cam_3d'].detach()).permute(0, 2, 1)[..., None] lidar2cam_r = camera_info[:, :9, :].permute(0, 2, 1) lidar2cam_r = lidar2cam_r.reshape(batch_size, num_img_proposals, 3, 3) lidar2cam_t = camera_info[:, 9:12, :].permute(0, 2, 1)[..., None] bev_coords = torch.matmul(torch.inverse(lidar2cam_r), loc_cam_3d - lidar2cam_t) bev_coords = bev_coords.squeeze(-1) bev_coords[..., 0:1] = (bev_coords[..., 0:1] - self.test_cfg['pc_range'][0]) / ( self.test_cfg['pc_range'][3] - self.test_cfg['pc_range'][0]) bev_coords[..., 1:2] = (bev_coords[..., 1:2] - self.test_cfg['pc_range'][1]) / ( self.test_cfg['pc_range'][4] - self.test_cfg['pc_range'][1]) bev_coords[..., 0:1] = bev_coords[..., 0:1] * (self.test_cfg['grid_size'][0] // self.test_cfg['out_size_factor']) bev_coords[..., 1:2] = bev_coords[..., 1:2] * (self.test_cfg['grid_size'][1] // self.test_cfg['out_size_factor']) dims, rots, vels = self.transform_bbox(ret_dicts[-1], camera_info, w, img_metas) bev_coords = torch.cat([bev_coords, rots, vels, dims], dim=2) return img_query_feat, normal_img_query_pos, bev_coords, camera_info, ret_dicts def transform_bbox(self, ret_dict, camera_info, width, img_metas): bs = camera_info.shape[0] num_proposal = camera_info.shape[2] lidar2cam_rs = camera_info[:, :9] lidar2cam_rs = 
lidar2cam_rs.reshape(bs, 3, 3, num_proposal) lidar2cam_rs = lidar2cam_rs.permute(0, 3, 1, 2) # [bs, num_proposals, 3, 3] cam2lidar_rs = torch.inverse(lidar2cam_rs) cam_dims = ret_dict['dim_2d'].detach().clone() # [bs, 3, num_proposals] cam_rots = ret_dict['rot_2d'].detach().clone() # [bs, 2, num_proposals] cam_vels = ret_dict['vel_2d'].detach().clone() # [bs, 2, num_proposals] dims = cam_dims[:, [2, 0, 1]] dims = dims.permute(0, 2, 1) sin_rots = -cam_rots[:, 0:1] cos_rots = cam_rots[:, 1:2] rot_dirs = torch.cat([cos_rots, torch.zeros_like(sin_rots), sin_rots], dim=1) # [bs, 3, num_proposals] rot_dirs = rot_dirs.permute(0, 2, 1).unsqueeze(-1) # [bs, num_proposals, 3, 1] rot_dirs = torch.matmul(cam2lidar_rs, rot_dirs) # [bs, num_proposals, 3, 1] lidar_rots = -rot_dirs[:, :, [0, 1], 0] # [bs, num_proposals, 2] cam_vels_x = cam_vels[:, 0:1, :] cam_vels_z = cam_vels[:, 1:2, :] vels = torch.cat([cam_vels_x, torch.zeros_like(cam_vels_x), cam_vels_z], dim=1) # [bs, 3, num_proposals] vels = vels.permute(0, 2, 1).unsqueeze(-1) # [bs, num_proposals, 3, 1] vels = torch.matmul(cam2lidar_rs, vels) # [bs, num_proposals, 3, 1] lidar_vels = vels[:, :, [0, 1], 0] return dims, lidar_rots, lidar_vels class ViewTransformer(nn.Module): def __init__(self, hidden_channel, num_heads, prediction_heads, ffn_channel, dropout, activation, test_cfg, query_pos, key_pos, view_projection, use_camera): super(ViewTransformer, self).__init__() self.hidden_channel = hidden_channel self.num_heads = num_heads self.prediction_heads = prediction_heads self.test_cfg = test_cfg self.grid_x_size = test_cfg['grid_size'][0] // test_cfg['out_size_factor'] self.grid_y_size = test_cfg['grid_size'][1] // test_cfg['out_size_factor'] self.view_projection = view_projection self.use_camera = use_camera if use_camera is not None: assert use_camera == "se" self.camera_net = CameraSE(16, hidden_channel) self.decoder = TransformerDecoderLayer( hidden_channel, num_heads, ffn_channel, activation=activation, dropout=dropout, 
self_posembed=query_pos, cross_posembed=key_pos, cross_only=True ) def forward(self, img_query_feat, img_query_pos_bev, normal_img_query_pos, img_ret_dicts, camera_info): bs = img_query_feat.shape[0] num_proposals = img_query_feat.shape[-1] center_3d = img_ret_dicts[-1]['loc_cam_3d'].detach().clone().permute(0, 2, 1) # [bs, num_proposal, 3] center_3d = center_3d[:, -num_proposals:] if self.use_camera is not None: img_query_feat = self.camera_net(img_query_feat, camera_info) camera_info = camera_info.permute(0, 2, 1) # [bs, num_proposal, 16] img_query_feat = self.view_projection(img_query_feat) camera_R = camera_info[:, :, :9].reshape(bs, num_proposals, 3, 3) camera_t = camera_info[:, :, 9:12].reshape(bs, num_proposals, 3, 1) camera_t = -torch.matmul(camera_R.permute(0, 1, 3, 2), camera_t).squeeze(-1) camera_t[..., 0:1] = (camera_t[..., 0:1] - self.test_cfg['pc_range'][0]) / ( self.test_cfg['pc_range'][3] - self.test_cfg['pc_range'][0]) camera_t[..., 1:2] = (camera_t[..., 1:2] - self.test_cfg['pc_range'][1]) / ( self.test_cfg['pc_range'][4] - self.test_cfg['pc_range'][1]) camera_t[..., 0:1] = camera_t[..., 0:1] * (self.test_cfg['grid_size'][0] // self.test_cfg['out_size_factor']) camera_t[..., 1:2] = camera_t[..., 1:2] * (self.test_cfg['grid_size'][0] // self.test_cfg['out_size_factor']) img_query_pos = copy.deepcopy(img_query_pos_bev[..., :7]) img_query_pos[..., :2] = inverse_sigmoid((img_query_pos[..., :2] + 12) / 204) img_query_pos[..., 2] = inverse_sigmoid((img_query_pos[..., 2] + 10) / 20) img_query_pos[..., 3:5] = inverse_sigmoid((img_query_pos[..., 3:5] + 1) / 2) img_query_pos = torch.cat([img_query_pos, normal_img_query_pos], dim=2) img_query_feat = self.decoder(img_query_feat, img_query_feat, img_query_pos, img_query_pos) # Prediction res_layer = self.prediction_heads(img_query_feat) res_layer['center_mono'] = img_query_pos_bev[..., 0:2].permute(0, 2, 1) res_layer['height_mono'] = img_query_pos_bev[..., 2:3].permute(0, 2, 1) res_layer['rot_mono'] = 
img_query_pos_bev[..., 3:5].permute(0, 2, 1) res_layer['vel_mono'] = img_query_pos_bev[..., 5:7].permute(0, 2, 1) res_layer['dim_mono'] = img_query_pos_bev[..., 7:10].permute(0, 2, 1) res_layer['center_view'] = res_layer['center_view'] + img_query_pos_bev[..., 0:2].permute(0, 2, 1) img_query_pos_bev = res_layer['center_view'].detach().clone().permute(0, 2, 1) return img_query_feat, img_query_pos_bev, [res_layer] class FusionTransformer2D_3D_Self(nn.Module): def __init__(self, hidden_channel, num_heads, num_decoder_layers, prediction_heads, ffn_channel, dropout, activation, test_cfg, query_pos, key_pos, pts_projection, img_projection, num_proposals): super(FusionTransformer2D_3D_Self, self).__init__() self.hidden_channel = hidden_channel self.num_heads = num_heads self.num_decoder_layers = num_decoder_layers self.prediction_heads = prediction_heads self.test_cfg = test_cfg self.grid_x_size = test_cfg['grid_size'][0] // test_cfg['out_size_factor'] self.grid_y_size = test_cfg['grid_size'][1] // test_cfg['out_size_factor'] self.pts_projection = pts_projection self.img_projection = img_projection self.num_proposals = num_proposals self.decoder = nn.ModuleList() for i in range(self.num_decoder_layers): self.decoder.append( TransformerDecoderLayer( hidden_channel, num_heads, ffn_channel, dropout, activation, self_posembed=query_pos[i], cross_posembed=key_pos[i], cross_only=True ) ) def forward(self, pts_query_feat, pts_query_pos, img_query_feat, img_query_pos, need_weights=False): ret_dicts = [] pts_query_feat = self.pts_projection(pts_query_feat) img_query_feat = self.img_projection(img_query_feat) all_query_feat = torch.cat([pts_query_feat, img_query_feat], dim=2) all_query_pos = torch.cat([pts_query_pos, img_query_pos], dim=1) for i in range(self.num_decoder_layers): # Transformer Decoder Layer # :param query: B C Pq :param query_pos: B Pq 3/6 all_query_feat_raw = all_query_feat.clone() if need_weights: all_query_feat, attn_weights = self.decoder[i](all_query_feat, 
all_query_feat, all_query_pos, all_query_pos, need_weights=True) else: all_query_feat = self.decoder[i](all_query_feat, all_query_feat, all_query_pos, all_query_pos) all_query_feat_pred = all_query_feat # Prediction res_layer = self.prediction_heads(all_query_feat_pred) res_layer['center'] = res_layer['center'] + all_query_pos.permute(0, 2, 1) ret_dicts.append(res_layer) all_query_pos = res_layer['center'].detach().clone().permute(0, 2, 1) # return all_query_feat, all_query_pos, ret_dicts if need_weights: return all_query_feat, all_query_pos, ret_dicts, attn_weights else: return all_query_feat, all_query_pos, ret_dicts class ImageTransformer2D_3D_MS(nn.Module): def __init__(self, num_views, hidden_channel, num_heads, num_decoder_layers, prediction_heads, out_size_factor_img, ffn_channel, dropout, activation, test_cfg, query_pos, key_pos, supervision2d): super(ImageTransformer2D_3D_MS, self).__init__() self.hidden_channel = hidden_channel self.num_heads = num_heads self.num_decoder_layers = num_decoder_layers self.prediction_heads = prediction_heads self.num_views = num_views self.out_size_factor_img = out_size_factor_img self.test_cfg = test_cfg self.supervision2d = supervision2d self.decoder = nn.ModuleList() for i in range(self.num_decoder_layers): self.decoder.append( DeformableTransformerDecoderLayer( hidden_channel, num_heads, dim_feedforward=ffn_channel, dropout=dropout, activation=activation, self_posembed=query_pos[i], cross_posembed=key_pos[i], ) ) def forward(self, img_query_feat, normal_img_query_pos, img_query_view, img_feats, normal_img_feats_pos_stack, img_metas): level_num = len(img_feats) batch_size = img_query_feat.shape[0] img_feats_flatten = [] level_start_index = [0] spatial_shapes = [] for lvl in range(level_num): img_feat = img_feats[lvl] h, w = img_feat.shape[-2], img_feat.shape[-1] img_feat_flatten = img_feat.view(batch_size, self.num_views, self.hidden_channel, h*w) # [bs, num_view, C, h*w] img_feats_flatten.append(img_feat_flatten) 
level_start_index.append(level_start_index[-1] + h*w) spatial_shapes.append([h, w]) level_start_index = level_start_index[:-1] level_start_index = torch.LongTensor(level_start_index).to(img_query_feat.device) spatial_shapes = torch.LongTensor(spatial_shapes).to(img_query_feat.device) img_feats_stack = torch.cat(img_feats_flatten, dim=3) # [bs, num_view, C, h*w (sum)] reference_points = normal_img_query_pos.sigmoid() # [bs, num_img_proposal, 2] reference_points = reference_points[:, :, None].repeat(1, 1, level_num, 1) ret_dicts = [] for i in range(self.num_decoder_layers): img_prev_query_feat = img_query_feat.clone() # [BS, C, num_proposals] img_query_feat = torch.zeros_like(img_query_feat) # create new container for img query feature for sample_idx in range(batch_size): for view_idx in range(self.num_views): on_the_image = img_query_view[sample_idx] == view_idx # [num_on_the_image, ] if torch.sum(on_the_image) <= 1: continue img_query_feat_view = img_prev_query_feat[sample_idx, :, on_the_image] # [C, num_on_the_image] img_query_feat_view = self.decoder[i]( img_query_feat_view[None], img_feats_stack[sample_idx:sample_idx + 1, view_idx], normal_img_query_pos[sample_idx:sample_idx + 1, on_the_image], normal_img_feats_pos_stack, reference_points=reference_points[sample_idx:sample_idx+1, on_the_image], level_start_index=level_start_index, spatial_shapes=spatial_shapes ) img_query_feat[sample_idx, :, on_the_image] = img_query_feat_view.clone() res_layer = self.prediction_heads(img_query_feat) if 'center_offset' in res_layer: assert 'center_2d' not in res_layer and 'offset' not in res_layer res_layer['center_2d'] = res_layer['center_offset'][:, :2] res_layer['offset'] = res_layer['center_offset'][:, 2:] res_layer['center_2d'] = res_layer['center_2d'] + normal_img_query_pos.permute(0, 2, 1) if self.supervision2d: normal_img_query_pos = res_layer['center_2d'].detach().clone().permute(0, 2, 1) res_layer['center_2d'] = res_layer['center_2d'].sigmoid() res_layer['offset'] = 
res_layer['offset'].sigmoid() bbox_width = res_layer['offset'][:, 0] + res_layer['offset'][:, 2] bbox_height = res_layer['offset'][:, 1] + res_layer['offset'][:, 3] bbox_cx = (res_layer['center_2d'][:, 0] - res_layer['offset'][:, 0] + res_layer['center_2d'][:, 0] + res_layer['offset'][:, 2]) / 2 bbox_cy = (res_layer['center_2d'][:, 1] - res_layer['offset'][:, 1] + res_layer['center_2d'][:, 1] + res_layer['offset'][:, 3]) / 2 res_layer['bbox_2d'] = torch.stack([bbox_cx, bbox_cy, bbox_width, bbox_height], dim=1).detach().clone() ret_dicts.append(res_layer) return img_query_feat, normal_img_query_pos, ret_dicts def camera2lidar(self, camera_coords, lidar2img, img_meta, batch_size): # img_pos: [W*H, 2] coords = torch.cat([camera_coords, torch.ones_like(camera_coords[..., :1])], dim=1) # [N, 4] img2lidars = torch.inverse(lidar2img) coords3d = torch.matmul(img2lidars, coords.unsqueeze(-1)).squeeze(-1)[..., :3] # [N, 3] if batch_size > 1: coords3d = apply_3d_transformation(coords3d, 'LIDAR', img_meta, reverse=False).detach() coords3d[..., 0:1] = (coords3d[..., 0:1] - self.test_cfg['pc_range'][0]) / ( self.test_cfg['pc_range'][3] - self.test_cfg['pc_range'][0]) coords3d[..., 1:2] = (coords3d[..., 1:2] - self.test_cfg['pc_range'][1]) / ( self.test_cfg['pc_range'][4] - self.test_cfg['pc_range'][1]) coords3d[..., 0:1] = coords3d[..., 0:1] * (self.test_cfg['grid_size'][0] // self.test_cfg['out_size_factor']) coords3d[..., 1:2] = coords3d[..., 1:2] * (self.test_cfg['grid_size'][1] // self.test_cfg['out_size_factor']) if not self.pos_3d: coords3d = coords3d[..., :2] # [N, 3] if self.pos_3d: coords3d = coords3d.contiguous().view(coords3d.size(0), 3) else: coords3d = coords3d.contiguous().view(coords3d.size(0), 2) return coords3d ================================================ FILE: mmdet3d/models/utils/transformer.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import copy import math import warnings import collections from typing import Sequence, Iterable, Optional from itertools import repeat import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import (Linear, build_activation_layer, build_conv_layer, build_norm_layer) from mmcv.runner.base_module import BaseModule from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, TORCH_VERSION, digit_version) from .drop import build_dropout from mmdet3d.models.registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING, TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) from mmcv.cnn.bricks.registry import ACTIVATION_LAYERS # From PyTorch internals def _ntuple(n): def parse(x): if isinstance(x, collections.abc.Iterable): return x return tuple(repeat(x, n)) return parse to_2tuple = _ntuple(2) class GELU(nn.Module): r"""Applies the Gaussian Error Linear Units function: .. math:: \text{GELU}(x) = x * \Phi(x) where :math:`\Phi(x)` is the Cumulative Distribution Function for Gaussian Distribution. Shape: - Input: :math:`(N, *)` where `*` means, any number of additional dimensions - Output: :math:`(N, *)`, same shape as the input .. image:: scripts/activation_images/GELU.png Examples:: >>> m = nn.GELU() >>> input = torch.randn(2) >>> output = m(input) """ def forward(self, input: torch.Tensor) -> torch.Tensor: return F.gelu(input) if (TORCH_VERSION == 'parrots' or digit_version(TORCH_VERSION) < digit_version('1.4')): ACTIVATION_LAYERS.register_module(module=GELU) else: ACTIVATION_LAYERS.register_module(module=nn.GELU) class ModuleList(BaseModule, nn.ModuleList): """ModuleList in openmmlab. Args: modules (iterable, optional): an iterable of modules to add. init_cfg (dict, optional): Initialization config dict. 
""" def __init__(self, modules: Optional[Iterable] = None, init_cfg: Optional[dict] = None): BaseModule.__init__(self, init_cfg) nn.ModuleList.__init__(self, modules) class Sequential(BaseModule, nn.Sequential): """Sequential module in openmmlab. Args: init_cfg (dict, optional): Initialization config dict. """ def __init__(self, *args, init_cfg: Optional[dict] = None): BaseModule.__init__(self, init_cfg) nn.Sequential.__init__(self, *args) def build_positional_encoding(cfg, default_args=None): """Builder for Position Encoding.""" return build_from_cfg(cfg, POSITIONAL_ENCODING, default_args) def build_attention(cfg, default_args=None): """Builder for attention.""" return build_from_cfg(cfg, ATTENTION, default_args) def build_feedforward_network(cfg, default_args=None): """Builder for feed-forward network (FFN).""" return build_from_cfg(cfg, FEEDFORWARD_NETWORK, default_args) def build_transformer_layer(cfg, default_args=None): """Builder for transformer layer.""" return build_from_cfg(cfg, TRANSFORMER_LAYER, default_args) def build_transformer_layer_sequence(cfg, default_args=None): """Builder for transformer encoder and transformer decoder.""" return build_from_cfg(cfg, TRANSFORMER_LAYER_SEQUENCE, default_args) class AdaptivePadding(nn.Module): """Applies padding adaptively to the input. This module can make input get fully covered by filter you specified. It support two modes "same" and "corner". The "same" mode is same with "SAME" padding mode in TensorFlow, pad zero around input. The "corner" mode would pad zero to bottom right. Args: kernel_size (int | tuple): Size of the kernel. Default: 1. stride (int | tuple): Stride of the filter. Default: 1. dilation (int | tuple): Spacing between kernel elements. Default: 1. padding (str): Support "same" and "corner", "corner" mode would pad zero to bottom right, and "same" mode would pad zero around input. Default: "corner". 
Example: >>> kernel_size = 16 >>> stride = 16 >>> dilation = 1 >>> input = torch.rand(1, 1, 15, 17) >>> adap_pad = AdaptivePadding( >>> kernel_size=kernel_size, >>> stride=stride, >>> dilation=dilation, >>> padding="corner") >>> out = adap_pad(input) >>> assert (out.shape[2], out.shape[3]) == (16, 32) >>> input = torch.rand(1, 1, 16, 17) >>> out = adap_pad(input) >>> assert (out.shape[2], out.shape[3]) == (16, 32) """ def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'): super().__init__() assert padding in ('same', 'corner') kernel_size = to_2tuple(kernel_size) stride = to_2tuple(stride) dilation = to_2tuple(dilation) self.padding = padding self.kernel_size = kernel_size self.stride = stride self.dilation = dilation def get_pad_shape(self, input_shape): """Calculate the padding size of input. Args: input_shape (:obj:`torch.Size`): arrange as (H, W). Returns: Tuple[int]: The padding size along the original H and W directions """ input_h, input_w = input_shape kernel_h, kernel_w = self.kernel_size stride_h, stride_w = self.stride output_h = math.ceil(input_h / stride_h) output_w = math.ceil(input_w / stride_w) pad_h = max((output_h - 1) * stride_h + (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0) pad_w = max((output_w - 1) * stride_w + (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0) return pad_h, pad_w def forward(self, x): """Add padding to `x` Args: x (Tensor): Input tensor has shape (B, C, H, W). Returns: Tensor: The tensor with adaptive padding """ pad_h, pad_w = self.get_pad_shape(x.size()[-2:]) if pad_h > 0 or pad_w > 0: if self.padding == 'corner': x = F.pad(x, [0, pad_w, 0, pad_h]) elif self.padding == 'same': x = F.pad(x, [ pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2 ]) return x class PatchEmbed(BaseModule): """Image to Patch Embedding. We use a conv layer to implement PatchEmbed. Args: in_channels (int): The num of input channels. Default: 3 embed_dims (int): The dimensions of embedding. 
Default: 768 conv_type (str): The type of convolution to generate patch embedding. Default: "Conv2d". kernel_size (int): The kernel_size of embedding conv. Default: 16. stride (int): The slide stride of embedding conv. Default: 16. padding (int | tuple | string): The padding length of embedding conv. When it is a string, it means the mode of adaptive padding, support "same" and "corner" now. Default: "corner". dilation (int): The dilation rate of embedding conv. Default: 1. bias (bool): Bias of embed conv. Default: True. norm_cfg (dict, optional): Config dict for normalization layer. Default: None. input_size (int | tuple | None): The size of input, which will be used to calculate the out size. Only works when `dynamic_size` is False. Default: None. init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization. Default: None. """ def __init__(self, in_channels=3, embed_dims=768, conv_type='Conv2d', kernel_size=16, stride=16, padding='corner', dilation=1, bias=True, norm_cfg=None, input_size=None, init_cfg=None): super().__init__(init_cfg=init_cfg) self.embed_dims = embed_dims if stride is None: stride = kernel_size kernel_size = to_2tuple(kernel_size) stride = to_2tuple(stride) dilation = to_2tuple(dilation) if isinstance(padding, str): self.adaptive_padding = AdaptivePadding( kernel_size=kernel_size, stride=stride, dilation=dilation, padding=padding) # disable the padding of conv padding = 0 else: self.adaptive_padding = None padding = to_2tuple(padding) self.projection = build_conv_layer( dict(type=conv_type), in_channels=in_channels, out_channels=embed_dims, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, bias=bias) if norm_cfg is not None: self.norm = build_norm_layer(norm_cfg, embed_dims)[1] else: self.norm = None if input_size: input_size = to_2tuple(input_size) # `init_out_size` would be used outside to # calculate the num_patches # e.g. 
when `use_abs_pos_embed` outside self.init_input_size = input_size if self.adaptive_padding: pad_h, pad_w = self.adaptive_padding.get_pad_shape(input_size) input_h, input_w = input_size input_h = input_h + pad_h input_w = input_w + pad_w input_size = (input_h, input_w) # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html h_out = (input_size[0] + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1) // stride[0] + 1 w_out = (input_size[1] + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1) // stride[1] + 1 self.init_out_size = (h_out, w_out) else: self.init_input_size = None self.init_out_size = None def forward(self, x): """ Args: x (Tensor): Has shape (B, C, H, W). In most case, C is 3. Returns: tuple: Contains merged results and its spatial shape. - x (Tensor): Has shape (B, out_h * out_w, embed_dims) - out_size (tuple[int]): Spatial shape of x, arrange as (out_h, out_w). """ if self.adaptive_padding: x = self.adaptive_padding(x) x = self.projection(x) out_size = (x.shape[2], x.shape[3]) x = x.flatten(2).transpose(1, 2) if self.norm is not None: x = self.norm(x) return x, out_size class PatchMerging(BaseModule): """Merge patch feature map. This layer groups feature map by kernel_size, and applies norm and linear layers to the grouped feature map ((used in Swin Transformer)). Our implementation uses `nn.Unfold` to merge patches, which is about 25% faster than the original implementation. However, we need to modify pretrained models for compatibility. Args: in_channels (int): The num of input channels. to gets fully covered by filter and stride you specified. out_channels (int): The num of output channels. kernel_size (int | tuple, optional): the kernel size in the unfold layer. Defaults to 2. stride (int | tuple, optional): the stride of the sliding blocks in the unfold layer. Default: None. (Would be set as `kernel_size`) padding (int | tuple | string ): The padding length of embedding conv. 
def forward(self, x, input_size):
    """Merge neighbouring patches of a token sequence.

    Args:
        x (Tensor): Has shape (B, H*W, C_in).
        input_size (tuple[int]): The spatial shape of x, arranged as
            (H, W).

    Returns:
        tuple: Contains merged results and its spatial shape.

            - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
            - out_size (tuple[int]): Spatial shape of x, arranged as
                (Merged_H, Merged_W).
    """
    batch, length, channels = x.shape
    assert isinstance(input_size, Sequence), \
        f'Expect input_size is `Sequence` but get {input_size}'

    height, width = input_size
    assert length == height * width, 'input feature has wrong size'

    # (B, H*W, C) -> (B, C, H, W) so that the unfold sampler can slide
    # over the spatial dimensions.
    feat = x.view(batch, height, width, channels).permute([0, 3, 1, 2])

    if self.adaptive_padding:
        feat = self.adaptive_padding(feat)
        height, width = feat.shape[-2:]

    # nn.Unfold gathers every kernel window into the channel dimension;
    # with kernel_size=2 and stride=2 the result is (B, 4*C, H/2*W/2).
    unfold = self.sampler
    merged = unfold(feat)

    def _out_len(size, axis):
        # Standard sliding-window output length along one spatial axis.
        span = unfold.dilation[axis] * (unfold.kernel_size[axis] - 1)
        return (size + 2 * unfold.padding[axis] - span -
                1) // unfold.stride[axis] + 1

    output_size = (_out_len(height, 0), _out_len(width, 1))

    merged = merged.transpose(1, 2)  # B, H/2*W/2, 4*C
    if self.norm:
        merged = self.norm(merged)
    return self.reduction(merged), output_size
def __init__(self,
             embed_dims,
             num_heads,
             attn_drop=0.,
             proj_drop=0.,
             dropout_layer=dict(type='Dropout', drop_prob=0.),
             init_cfg=None,
             batch_first=False,
             **kwargs):
    """Build the wrapped ``nn.MultiheadAttention`` and its dropout layers.

    Args:
        embed_dims (int): The embedding dimension.
        num_heads (int): Parallel attention heads.
        attn_drop (float): Dropout on the attention weights, forwarded to
            ``nn.MultiheadAttention``. Default: 0.0.
        proj_drop (float): Dropout applied to the attention output.
            Default: 0.0.
        dropout_layer (dict | None): Config of the dropout applied before
            the identity connection. A falsy value selects
            ``nn.Identity``.
        init_cfg (dict | None): Initialization config passed to the base
            class. Default: None.
        batch_first (bool): Whether inputs are (batch, n, embed_dim).
            Default: False.
        **kwargs: Forwarded to ``nn.MultiheadAttention``. The deprecated
            key ``dropout`` is consumed here and overrides both
            ``attn_drop`` and the ``drop_prob`` of ``dropout_layer``.
    """
    super().__init__(init_cfg)
    if 'dropout' in kwargs:
        warnings.warn(
            'The arguments `dropout` in MultiheadAttention '
            'has been deprecated, now you can separately '
            'set `attn_drop`(float), proj_drop(float), '
            'and `dropout_layer`(dict) ', DeprecationWarning)
        legacy_dropout = kwargs.pop('dropout')
        attn_drop = legacy_dropout
        # Build a fresh dict instead of writing into `dropout_layer`:
        # the default value is a single dict shared by every call, so an
        # in-place update would leak this drop_prob into all instances
        # created afterwards (and into caller-owned configs).
        dropout_layer = {**dropout_layer, 'drop_prob': legacy_dropout}

    self.embed_dims = embed_dims
    self.num_heads = num_heads
    self.batch_first = batch_first

    self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop,
                                      **kwargs)

    self.proj_drop = nn.Dropout(proj_drop)
    self.dropout_layer = build_dropout(
        dropout_layer) if dropout_layer else nn.Identity()
key_pos (Tensor): The positional encoding for `key`, with the same shape as `key`. Defaults to None. If not None, it will be added to `key` before forward function. If None, and `query_pos` has the same shape as `key`, then `query_pos` will be used for `key_pos`. Defaults to None. attn_mask (Tensor): ByteTensor mask with shape [num_queries, num_keys]. Same in `nn.MultiheadAttention.forward`. Defaults to None. key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. Defaults to None. Returns: Tensor: forwarded results with shape [num_queries, bs, embed_dims] if self.batch_first is False, else [bs, num_queries embed_dims]. """ if key is None: key = query if value is None: value = key if identity is None: identity = query if key_pos is None: if query_pos is not None: # use query_pos if key_pos is not available if query_pos.shape == key.shape: key_pos = query_pos else: warnings.warn(f'position encoding of key is' f'missing in {self.__class__.__name__}.') if query_pos is not None: query = query + query_pos if key_pos is not None: key = key + key_pos # Because the dataflow('key', 'query', 'value') of # ``torch.nn.MultiheadAttention`` is (num_query, batch, # embed_dims), We should adjust the shape of dataflow from # batch_first (batch, num_query, embed_dims) to num_query_first # (num_query ,batch, embed_dims), and recover ``attn_output`` # from num_query_first to batch_first. if self.batch_first: query = query.transpose(0, 1) key = key.transpose(0, 1) value = value.transpose(0, 1) out = self.attn( query=query, key=key, value=value, attn_mask=attn_mask, key_padding_mask=key_padding_mask)[0] if self.batch_first: out = out.transpose(0, 1) return identity + self.dropout_layer(self.proj_drop(out)) @FEEDFORWARD_NETWORK.register_module() class FFN(BaseModule): """Implements feed-forward networks (FFNs) with identity connection. Args: embed_dims (int): The feature dimension. Same as `MultiheadAttention`. Defaults: 256. 
@deprecated_api_warning({'residual': 'identity'}, cls_name='FFN')
def forward(self, x, identity=None):
    """Forward function for `FFN`.

    Runs ``x`` through the stacked layers and the shortcut dropout. When
    ``add_identity`` is enabled the result is added to ``identity``, or
    to ``x`` itself if ``identity`` is None.
    """
    transformed = self.dropout_layer(self.layers(x))
    if self.add_identity:
        shortcut = x if identity is None else identity
        return shortcut + transformed
    return transformed
def __init__(self,
             attn_cfgs=None,
             ffn_cfgs=dict(
                 type='FFN',
                 embed_dims=256,
                 feedforward_channels=1024,
                 num_fcs=2,
                 ffn_drop=0.,
                 act_cfg=dict(type='ReLU', inplace=True),
             ),
             operation_order=None,
             norm_cfg=dict(type='LN'),
             init_cfg=None,
             batch_first=False,
             **kwargs):
    """Build the attention, FFN and norm modules named in
    ``operation_order``.

    Args:
        attn_cfgs (list[dict] | dict | None): Attention configs, one per
            'self_attn'/'cross_attn' entry of ``operation_order``; a
            single dict is replicated for every attention.
        ffn_cfgs (list[dict] | dict): FFN configs, one per 'ffn' entry; a
            single dict is replicated for every FFN.
        operation_order (tuple[str]): Sequence made of 'self_attn',
            'norm', 'ffn' and 'cross_attn'. Required.
        norm_cfg (dict): Config of the normalization layers.
        init_cfg (dict | None): Initialization config for the base class.
        batch_first (bool): Propagated to every attention config that
            does not set it; a conflicting value fails an assertion.
        **kwargs: Only the deprecated FFN arguments
            ``feedforward_channels``, ``ffn_dropout`` and ``ffn_num_fcs``
            are read; they are written into ``ffn_cfgs``.

    Raises:
        TypeError: If ``operation_order`` is None.
    """
    # Work on private copies of the configs. The default `ffn_cfgs` dict
    # is shared by every call, and list/dict arguments belong to the
    # caller, so the in-place edits below must not reach them.
    ffn_cfgs = copy.deepcopy(ffn_cfgs)

    deprecated_args = dict(
        feedforward_channels='feedforward_channels',
        ffn_dropout='ffn_drop',
        ffn_num_fcs='num_fcs')
    for ori_name, new_name in deprecated_args.items():
        if ori_name in kwargs:
            warnings.warn(
                f'The arguments `{ori_name}` in BaseTransformerLayer '
                f'has been deprecated, now you should set `{new_name}` '
                f'and other FFN related arguments '
                f'to a dict named `ffn_cfgs`. ', DeprecationWarning)
            ffn_cfgs[new_name] = kwargs[ori_name]

    super().__init__(init_cfg)

    self.batch_first = batch_first

    if operation_order is None:
        # Same exception type `set(None)` would raise, with a usable
        # message.
        raise TypeError(
            f'`operation_order` of {self.__class__.__name__} must be a '
            f'sequence of operation names, but got None.')

    supported_ops = ['self_attn', 'norm', 'ffn', 'cross_attn']
    assert set(operation_order) <= set(supported_ops), \
        f'The operation_order of {self.__class__.__name__} should ' \
        f'only contain operation types from {supported_ops}, ' \
        f'but got {operation_order}.'

    num_attn = operation_order.count('self_attn') + operation_order.count(
        'cross_attn')
    if isinstance(attn_cfgs, dict):
        attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]
    else:
        assert num_attn == len(attn_cfgs), \
            f'The length of attn_cfgs {len(attn_cfgs)} is not ' \
            f'consistent with the number of attention {num_attn} ' \
            f'in operation_order {operation_order}.'
        attn_cfgs = copy.deepcopy(attn_cfgs)

    self.num_attn = num_attn
    self.operation_order = operation_order
    self.norm_cfg = norm_cfg
    # A leading 'norm' selects the pre-norm variant.
    self.pre_norm = operation_order[0] == 'norm'
    self.attentions = ModuleList()

    index = 0
    for operation_name in operation_order:
        if operation_name in ['self_attn', 'cross_attn']:
            if 'batch_first' in attn_cfgs[index]:
                assert self.batch_first == attn_cfgs[index]['batch_first']
            else:
                attn_cfgs[index]['batch_first'] = self.batch_first
            attention = build_attention(attn_cfgs[index])
            # Some custom attentions used as `self_attn`
            # or `cross_attn` can have different behavior.
            attention.operation_name = operation_name
            self.attentions.append(attention)
            index += 1

    self.embed_dims = self.attentions[0].embed_dims

    self.ffns = ModuleList()
    num_ffns = operation_order.count('ffn')
    if isinstance(ffn_cfgs, dict):
        ffn_cfgs = ConfigDict(ffn_cfgs)
    if isinstance(ffn_cfgs, dict):
        ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)]
    assert len(ffn_cfgs) == num_ffns
    for ffn_index in range(num_ffns):
        if 'embed_dims' not in ffn_cfgs[ffn_index]:
            ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims
        else:
            assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims
        self.ffns.append(
            build_feedforward_network(ffn_cfgs[ffn_index],
                                      dict(type='FFN')))

    self.norms = ModuleList()
    num_norms = operation_order.count('norm')
    for _ in range(num_norms):
        self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1])
Returns: Tensor: forwarded results with shape [num_queries, bs, embed_dims]. """ norm_index = 0 attn_index = 0 ffn_index = 0 identity = query if attn_masks is None: attn_masks = [None for _ in range(self.num_attn)] elif isinstance(attn_masks, torch.Tensor): attn_masks = [ copy.deepcopy(attn_masks) for _ in range(self.num_attn) ] warnings.warn(f'Use same attn_mask in all attentions in ' f'{self.__class__.__name__} ') else: assert len(attn_masks) == self.num_attn, f'The length of ' \ f'attn_masks {len(attn_masks)} must be equal ' \ f'to the number of attention in ' \ f'operation_order {self.num_attn}' for layer in self.operation_order: if layer == 'self_attn': temp_key = temp_value = query query = self.attentions[attn_index]( query, temp_key, temp_value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=query_pos, attn_mask=attn_masks[attn_index], key_padding_mask=query_key_padding_mask, **kwargs) attn_index += 1 identity = query elif layer == 'norm': query = self.norms[norm_index](query) norm_index += 1 elif layer == 'cross_attn': query = self.attentions[attn_index]( query, key, value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=key_pos, attn_mask=attn_masks[attn_index], key_padding_mask=key_padding_mask, **kwargs) attn_index += 1 identity = query elif layer == 'ffn': query = self.ffns[ffn_index]( query, identity if self.pre_norm else None) ffn_index += 1 return query @TRANSFORMER_LAYER_SEQUENCE.register_module() class TransformerLayerSequence(BaseModule): """Base class for TransformerEncoder and TransformerDecoder in vision transformer. As base-class of Encoder and Decoder in vision transformer. Support customization such as specifying different kind of `transformer_layer` in `transformer_coder`. Args: transformerlayer (list[obj:`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict`): Config of transformerlayer in TransformerCoder. If it is obj:`mmcv.ConfigDict`, it would be repeated `num_layer` times to a list[`mmcv.ConfigDict`]. 
def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None):
    """Build ``num_layers`` transformer layers from their configs.

    A single dict config is deep-copied once per layer; otherwise
    ``transformerlayers`` must be a list holding exactly ``num_layers``
    configs.
    """
    super().__init__(init_cfg)
    if not isinstance(transformerlayers, dict):
        assert isinstance(transformerlayers, list) and \
            len(transformerlayers) == num_layers
        layer_cfgs = transformerlayers
    else:
        layer_cfgs = [
            copy.deepcopy(transformerlayers) for _ in range(num_layers)
        ]

    self.num_layers = num_layers
    self.layers = ModuleList()
    for layer_idx in range(num_layers):
        built_layer = build_transformer_layer(layer_cfgs[layer_idx])
        self.layers.append(built_layer)

    # The first layer is taken as representative of the whole stack.
    first_layer = self.layers[0]
    self.embed_dims = first_layer.embed_dims
    self.pre_norm = first_layer.pre_norm
class PositionEmbeddingLearnedLN(nn.Module):
    """Learned absolute position embedding built from linear layers and a
    final LayerNorm."""

    def __init__(self, input_channel, num_pos_feats=288):
        super().__init__()
        stages = [
            nn.Linear(input_channel, num_pos_feats),
            nn.ReLU(inplace=True),
            nn.Linear(num_pos_feats, num_pos_feats),
            nn.LayerNorm(num_pos_feats),
        ]
        self.position_embedding_head = nn.Sequential(*stages)

    def forward(self, xyz):
        # The head works on the last dimension; dims 1 and 2 are swapped
        # afterwards so the embedding features come before the positions.
        embedded = self.position_embedding_head(xyz)
        return embedded.transpose(1, 2).contiguous()
class PositionEmbeddingLearnedwoNorm(nn.Module):
    """Learned absolute position embedding made of two 1x1 convolutions
    with a ReLU in between and no normalization layer."""

    def __init__(self, input_channel, num_pos_feats=288):
        super().__init__()
        stages = [
            nn.Conv1d(input_channel, num_pos_feats, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.Conv1d(num_pos_feats, num_pos_feats, kernel_size=1),
        ]
        self.position_embedding_head = nn.Sequential(*stages)

    def forward(self, xyz):
        # Conv1d expects channels on dim 1, so swap dims 1 and 2 first.
        channels_first = xyz.transpose(1, 2).contiguous()
        return self.position_embedding_head(channels_first)
class PositionEmbeddingLearnedMultiInput(nn.Module):
    """Sum of several learned position embeddings, one per input.

    One ``PositionEmbeddingLearned`` head is created for every entry of
    ``input_channels``; ``forward`` expects one input per head and adds
    up the resulting embeddings.
    """

    def __init__(self, input_channels, num_pos_feats=288):
        super().__init__()
        self.position_embedding_heads = nn.ModuleList()
        self.pos_num = len(input_channels)
        for head_idx in range(self.pos_num):
            head = PositionEmbeddingLearned(input_channels[head_idx],
                                            num_pos_feats)
            self.position_embedding_heads.append(head)

    def forward(self, xyzs):
        assert len(xyzs) == self.pos_num
        total = None
        for head_idx in range(self.pos_num):
            embedding = self.position_embedding_heads[head_idx](
                xyzs[head_idx])
            # The first embedding seeds the running sum; None is returned
            # when there are no heads at all.
            total = embedding if total is None else total + embedding
        return total
def forward(self, query, key, query_pos, key_pos, attn_mask=None, need_weights=False):
    """Run (optional) self-attention, cross-attention and the FFN.

    :param query: B C Pq
    :param key: B C Pk
    :param query_pos: B Pq 3/6
    :param key_pos: B Pk 3/6
    :param attn_mask: forwarded to the cross-attention module
    :param need_weights: if True, also return the cross-attention weights
    :return: updated query (B C Pq), optionally with the weights
    """
    # Position embeddings are permuted the same way as the features below.
    q_embed = (None if self.self_posembed is None
               else self.self_posembed(query_pos).permute(2, 0, 1))
    k_embed = (None if self.cross_posembed is None
               else self.cross_posembed(key_pos).permute(2, 0, 1))

    # NxCxP to PxNxC
    tgt = query.permute(2, 0, 1)
    memory = key.permute(2, 0, 1)

    if not self.cross_only:
        # Self-attention: query, key and value are all the embedded query.
        self_in = self.with_pos_embed(tgt, q_embed)
        self_out = self.self_attn(self_in, self_in, value=self_in)[0]
        tgt = self.norm1(tgt + self.dropout1(self_out))

    # Cross-attention: the value carries the key position embedding too.
    cross_out, weights = self.multihead_attn(
        query=self.with_pos_embed(tgt, q_embed),
        key=self.with_pos_embed(memory, k_embed),
        value=self.with_pos_embed(memory, k_embed),
        attn_mask=attn_mask)
    tgt = self.norm2(tgt + self.dropout2(cross_out))

    # Position-wise feed-forward block with residual connection.
    hidden = self.activation(self.linear1(tgt))
    ffn_out = self.linear2(self.dropout(hidden))
    tgt = self.norm3(tgt + self.dropout3(ffn_out))

    # PxNxC back to NxCxP
    result = tgt.permute(1, 2, 0)
    return (result, weights) if need_weights else result
See reference: Attention Is All You Need .. math:: \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) Args: embed_dim: total dimension of the model. num_heads: parallel attention heads. dropout: a Dropout layer on attn_output_weights. Default: 0.0. bias: add bias as module parameter. Default: True. add_bias_kv: add bias to the key and value sequences at dim=0. add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. kdim: total number of features in key. Default: None. vdim: total number of features in key. Default: None. Note: if kdim and vdim are None, they will be set to embed_dim such that query, key, and value have the same number of features. Examples:: >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) >>> attn_output, attn_output_weights = multihead_attn(query, key, value) """ def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None): super(MultiheadAttention, self).__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim self.num_heads = num_heads self.dropout = dropout self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim)) if self._qkv_same_embed_dim is False: self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim)) self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim)) self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim)) if bias: self.in_proj_bias = Parameter(torch.empty(3 * embed_dim)) else: self.register_parameter('in_proj_bias', None) self.out_proj = Linear(embed_dim, embed_dim, bias=bias) if add_bias_kv: self.bias_k = 
Parameter(torch.empty(1, 1, embed_dim)) self.bias_v = Parameter(torch.empty(1, 1, embed_dim)) else: self.bias_k = self.bias_v = None self.add_zero_attn = add_zero_attn self._reset_parameters() def _reset_parameters(self): if self._qkv_same_embed_dim: xavier_uniform_(self.in_proj_weight) else: xavier_uniform_(self.q_proj_weight) xavier_uniform_(self.k_proj_weight) xavier_uniform_(self.v_proj_weight) if self.in_proj_bias is not None: constant_(self.in_proj_bias, 0.) constant_(self.out_proj.bias, 0.) if self.bias_k is not None: xavier_normal_(self.bias_k) if self.bias_v is not None: xavier_normal_(self.bias_v) def forward(self, query, key, value, key_padding_mask=None, need_weights=True, attn_mask=None): r""" Args: query, key, value: map a query and a set of key-value pairs to an output. See "Attention Is All You Need" for more details. key_padding_mask: if provided, specified padding elements in the key will be ignored by the attention. This is an binary mask. When the value is True, the corresponding value on the attention layer will be filled with -inf. need_weights: output attn_output_weights. attn_mask: mask that prevents attention to certain positions. This is an additive mask (i.e. the values will be added to the attention layer). Shape: - Inputs: - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is the embedding dimension. - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is the embedding dimension. - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is the embedding dimension. - key_padding_mask: :math:`(N, S)`, ByteTensor, where N is the batch size, S is the source sequence length. - attn_mask: :math:`(L, S)` where L is the target sequence length, S is the source sequence length. - Outputs: - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is the embedding dimension. 
- attn_output_weights: :math:`(N, L, S)` where N is the batch size, L is the target sequence length, S is the source sequence length. """ if hasattr(self, '_qkv_same_embed_dim') and self._qkv_same_embed_dim is False: return multi_head_attention_forward( query, key, value, self.embed_dim, self.num_heads, self.in_proj_weight, self.in_proj_bias, self.bias_k, self.bias_v, self.add_zero_attn, self.dropout, self.out_proj.weight, self.out_proj.bias, training=self.training, key_padding_mask=key_padding_mask, need_weights=need_weights, attn_mask=attn_mask, use_separate_proj_weight=True, q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight, v_proj_weight=self.v_proj_weight) else: if not hasattr(self, '_qkv_same_embed_dim'): warnings.warn('A new version of MultiheadAttention module has been implemented. \ Please re-train your model with the new module', UserWarning) return multi_head_attention_forward( query, key, value, self.embed_dim, self.num_heads, self.in_proj_weight, self.in_proj_bias, self.bias_k, self.bias_v, self.add_zero_attn, self.dropout, self.out_proj.weight, self.out_proj.bias, training=self.training, key_padding_mask=key_padding_mask, need_weights=need_weights, attn_mask=attn_mask) def multi_head_attention_forward(query, # type: Tensor key, # type: Tensor value, # type: Tensor embed_dim_to_check, # type: int num_heads, # type: int in_proj_weight, # type: Tensor in_proj_bias, # type: Tensor bias_k, # type: Optional[Tensor] bias_v, # type: Optional[Tensor] add_zero_attn, # type: bool dropout_p, # type: float out_proj_weight, # type: Tensor out_proj_bias, # type: Tensor training=True, # type: bool key_padding_mask=None, # type: Optional[Tensor] need_weights=True, # type: bool attn_mask=None, # type: Optional[Tensor] use_separate_proj_weight=False, # type: bool q_proj_weight=None, # type: Optional[Tensor] k_proj_weight=None, # type: Optional[Tensor] v_proj_weight=None, # type: Optional[Tensor] static_k=None, # type: Optional[Tensor] static_v=None, # 
type: Optional[Tensor] ): # type: (...) -> Tuple[Tensor, Optional[Tensor]] r""" Args: query, key, value: map a query and a set of key-value pairs to an output. See "Attention Is All You Need" for more details. embed_dim_to_check: total dimension of the model. num_heads: parallel attention heads. in_proj_weight, in_proj_bias: input projection weight and bias. bias_k, bias_v: bias of the key and value sequences to be added at dim=0. add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. dropout_p: probability of an element to be zeroed. out_proj_weight, out_proj_bias: the output projection weight and bias. training: apply dropout if is ``True``. key_padding_mask: if provided, specified padding elements in the key will be ignored by the attention. This is an binary mask. When the value is True, the corresponding value on the attention layer will be filled with -inf. need_weights: output attn_output_weights. attn_mask: mask that prevents attention to certain positions. This is an additive mask (i.e. the values will be added to the attention layer). use_separate_proj_weight: the function accept the proj. weights for query, key, and value in differnt forms. If false, in_proj_weight will be used, which is a combination of q_proj_weight, k_proj_weight, v_proj_weight. q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias. static_k, static_v: static key and value used for attention operators. Shape: Inputs: - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is the embedding dimension. - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is the embedding dimension. - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is the embedding dimension. - key_padding_mask: :math:`(N, S)`, ByteTensor, where N is the batch size, S is the source sequence length. 
- attn_mask: :math:`(L, S)` where L is the target sequence length, S is the source sequence length. - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length, N is the batch size, E is the embedding dimension. E/num_heads is the head dimension. - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length, N is the batch size, E is the embedding dimension. E/num_heads is the head dimension. Outputs: - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is the embedding dimension. - attn_output_weights: :math:`(N, L, S)` where N is the batch size, L is the target sequence length, S is the source sequence length. """ qkv_same = torch.equal(query, key) and torch.equal(key, value) kv_same = torch.equal(key, value) tgt_len, bsz, embed_dim = query.size() assert embed_dim == embed_dim_to_check assert list(query.size()) == [tgt_len, bsz, embed_dim] assert key.size() == value.size() head_dim = embed_dim // num_heads assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" scaling = float(head_dim) ** -0.5 if use_separate_proj_weight is not True: if qkv_same: # self-attention q, k, v = F.linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1) elif kv_same: # encoder-decoder attention # This is inline in_proj function with in_proj_weight and in_proj_bias _b = in_proj_bias _start = 0 _end = embed_dim _w = in_proj_weight[_start:_end, :] if _b is not None: _b = _b[_start:_end] q = F.linear(query, _w, _b) if key is None: assert value is None k = None v = None else: # This is inline in_proj function with in_proj_weight and in_proj_bias _b = in_proj_bias _start = embed_dim _end = None _w = in_proj_weight[_start:, :] if _b is not None: _b = _b[_start:] k, v = F.linear(key, _w, _b).chunk(2, dim=-1) else: # This is inline in_proj function with in_proj_weight and in_proj_bias _b = in_proj_bias _start = 0 _end = embed_dim _w = in_proj_weight[_start:_end, :] 
if _b is not None: _b = _b[_start:_end] q = F.linear(query, _w, _b) # This is inline in_proj function with in_proj_weight and in_proj_bias _b = in_proj_bias _start = embed_dim _end = embed_dim * 2 _w = in_proj_weight[_start:_end, :] if _b is not None: _b = _b[_start:_end] k = F.linear(key, _w, _b) # This is inline in_proj function with in_proj_weight and in_proj_bias _b = in_proj_bias _start = embed_dim * 2 _end = None _w = in_proj_weight[_start:, :] if _b is not None: _b = _b[_start:] v = F.linear(value, _w, _b) else: q_proj_weight_non_opt = torch.jit._unwrap_optional(q_proj_weight) len1, len2 = q_proj_weight_non_opt.size() assert len1 == embed_dim and len2 == query.size(-1) k_proj_weight_non_opt = torch.jit._unwrap_optional(k_proj_weight) len1, len2 = k_proj_weight_non_opt.size() assert len1 == embed_dim and len2 == key.size(-1) v_proj_weight_non_opt = torch.jit._unwrap_optional(v_proj_weight) len1, len2 = v_proj_weight_non_opt.size() assert len1 == embed_dim and len2 == value.size(-1) if in_proj_bias is not None: q = F.linear(query, q_proj_weight_non_opt, in_proj_bias[0:embed_dim]) k = F.linear(key, k_proj_weight_non_opt, in_proj_bias[embed_dim:(embed_dim * 2)]) v = F.linear(value, v_proj_weight_non_opt, in_proj_bias[(embed_dim * 2):]) else: q = F.linear(query, q_proj_weight_non_opt, in_proj_bias) k = F.linear(key, k_proj_weight_non_opt, in_proj_bias) v = F.linear(value, v_proj_weight_non_opt, in_proj_bias) q = q * scaling if bias_k is not None and bias_v is not None: if static_k is None and static_v is None: k = torch.cat([k, bias_k.repeat(1, bsz, 1)]) v = torch.cat([v, bias_v.repeat(1, bsz, 1)]) if attn_mask is not None: attn_mask = torch.cat([attn_mask, torch.zeros((attn_mask.size(0), 1), dtype=attn_mask.dtype, device=attn_mask.device)], dim=1) if key_padding_mask is not None: key_padding_mask = torch.cat( [key_padding_mask, torch.zeros((key_padding_mask.size(0), 1), dtype=key_padding_mask.dtype, device=key_padding_mask.device)], dim=1) else: assert static_k 
is None, "bias cannot be added to static key." assert static_v is None, "bias cannot be added to static value." else: assert bias_k is None assert bias_v is None q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1) if k is not None: k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) if v is not None: v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) if static_k is not None: assert static_k.size(0) == bsz * num_heads assert static_k.size(2) == head_dim k = static_k if static_v is not None: assert static_v.size(0) == bsz * num_heads assert static_v.size(2) == head_dim v = static_v src_len = k.size(1) if key_padding_mask is not None: assert key_padding_mask.size(0) == bsz assert key_padding_mask.size(1) == src_len if add_zero_attn: src_len += 1 k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1) v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1) if attn_mask is not None: if len(attn_mask.shape) == 2: attn_mask = torch.cat([attn_mask, torch.zeros((attn_mask.size(0), 1), dtype=attn_mask.dtype, device=attn_mask.device)], dim=1) else: attn_mask = torch.cat([attn_mask, torch.zeros((attn_mask.size(0), attn_mask.size(1), 1), dtype=attn_mask.dtype, device=attn_mask.device)], dim=2) if key_padding_mask is not None: key_padding_mask = torch.cat( [key_padding_mask, torch.zeros((key_padding_mask.size(0), 1), dtype=key_padding_mask.dtype, device=key_padding_mask.device)], dim=1) attn_output_weights = torch.bmm(q, k.transpose(1, 2)) assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len] if attn_mask is not None: if len(attn_mask.shape) == 2: attn_mask = attn_mask.unsqueeze(0) else: attn_mask = attn_mask.unsqueeze(1).repeat(1, num_heads, 1, 1) attn_mask = attn_mask.reshape(attn_mask.size(0)*num_heads, attn_mask.size(2), attn_mask.size(3)) attn_output_weights += attn_mask if key_padding_mask is not 
@VOXEL_ENCODERS.register_module()
class PillarFeatureNet(nn.Module):
    """Pillar Feature Net.

    The network prepares the pillar features and performs forward pass
    through PFNLayers.

    Args:
        in_channels (int, optional): Number of input features,
            either x, y, z or x, y, z, r. Defaults to 4.
        feat_channels (tuple, optional): Number of features in each of the
            N PFNLayers. Defaults to (64, ).
        with_distance (bool, optional): Whether to include Euclidean distance
            to points (1 extra channel). Defaults to False.
        with_cluster_center (bool, optional): Whether to append the x/y/z
            offset of every point from the mean of the points in its pillar
            (3 extra channels). Defaults to True.
        with_voxel_center (bool, optional): Whether to append the x/y offset
            of every point from the center of its pillar (2 extra channels).
            Defaults to True.
        voxel_size (tuple[float], optional): Size of voxels, only utilize x
            and y size. Defaults to (0.2, 0.2, 4).
        point_cloud_range (tuple[float], optional): Point cloud range, only
            utilizes x and y min. Defaults to (0, -40, -3, 70.4, 40, 1).
        norm_cfg (dict, optional): Config dict of the normalization layer,
            forwarded to every PFNLayer.
            Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01).
        mode (str, optional): The mode to gather point features. Options are
            'max' or 'avg'. Defaults to 'max'.
        legacy (bool): Whether to use the original behavior. If True, the
            pillar-center offsets are written into a view of the input
            ``features``, so the x/y channels of the input are overwritten
            in place. If False, the offsets are computed in a new tensor
            and the input is left untouched. Defaults to True.
    """

    def __init__(self,
                 in_channels=4,
                 feat_channels=(64, ),
                 with_distance=False,
                 with_cluster_center=True,
                 with_voxel_center=True,
                 voxel_size=(0.2, 0.2, 4),
                 point_cloud_range=(0, -40, -3, 70.4, 40, 1),
                 norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
                 mode='max',
                 legacy=True):
        super(PillarFeatureNet, self).__init__()
        assert len(feat_channels) > 0
        self.legacy = legacy
        # Every enabled decoration widens the per-point feature vector.
        if with_cluster_center:
            in_channels += 3
        if with_voxel_center:
            in_channels += 2
        if with_distance:
            in_channels += 1
        self._with_distance = with_distance
        self._with_cluster_center = with_cluster_center
        self._with_voxel_center = with_voxel_center
        self.fp16_enabled = False

        # Create PillarFeatureNet layers
        self.in_channels = in_channels
        feat_channels = [in_channels] + list(feat_channels)
        num_layers = len(feat_channels) - 1
        pfn_layers = []
        for i in range(num_layers):
            pfn_layers.append(
                PFNLayer(
                    feat_channels[i],
                    feat_channels[i + 1],
                    norm_cfg=norm_cfg,
                    # Only the final layer returns pooled pillar features.
                    last_layer=(i == num_layers - 1),
                    mode=mode))
        self.pfn_layers = nn.ModuleList(pfn_layers)

        # Need pillar (voxel) size and x/y offset in order to calculate offset
        self.vx = voxel_size[0]
        self.vy = voxel_size[1]
        self.x_offset = self.vx / 2 + point_cloud_range[0]
        self.y_offset = self.vy / 2 + point_cloud_range[1]
        self.point_cloud_range = point_cloud_range

    @force_fp32(out_fp16=True)
    def forward(self, features, num_points, coors):
        """Forward function.

        Args:
            features (torch.Tensor): Point features or raw points in shape
                (N, M, C).
            num_points (torch.Tensor): Number of points in each pillar.
            coors (torch.Tensor): Coordinates of each voxel. Column 3 is
                used as the x index and column 2 as the y index.

        Returns:
            torch.Tensor: Features of pillars in shape (N, C_out).
        """
        features_ls = [features]
        # Find distance of x, y, and z from cluster center
        if self._with_cluster_center:
            points_mean = features[:, :, :3].sum(
                dim=1, keepdim=True) / num_points.type_as(features).view(
                    -1, 1, 1)
            f_cluster = features[:, :, :3] - points_mean
            features_ls.append(f_cluster)

        # Find distance of x, y, and z from pillar center
        dtype = features.dtype
        if self._with_voxel_center:
            if not self.legacy:
                f_center = torch.zeros_like(features[:, :, :2])
                f_center[:, :, 0] = features[:, :, 0] - (
                    coors[:, 3].to(dtype).unsqueeze(1) * self.vx +
                    self.x_offset)
                f_center[:, :, 1] = features[:, :, 1] - (
                    coors[:, 2].to(dtype).unsqueeze(1) * self.vy +
                    self.y_offset)
            else:
                # Legacy path: `f_center` is a view, so the assignments
                # below also overwrite x/y inside `features` itself.
                f_center = features[:, :, :2]
                f_center[:, :, 0] = f_center[:, :, 0] - (
                    coors[:, 3].type_as(features).unsqueeze(1) * self.vx +
                    self.x_offset)
                f_center[:, :, 1] = f_center[:, :, 1] - (
                    coors[:, 2].type_as(features).unsqueeze(1) * self.vy +
                    self.y_offset)
            features_ls.append(f_center)

        if self._with_distance:
            points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
            features_ls.append(points_dist)

        # Combine together feature decorations
        features = torch.cat(features_ls, dim=-1)
        # The feature decorations were calculated without regard to whether
        # pillar was empty. Need to ensure that
        # empty pillars remain set to zeros.
        voxel_count = features.shape[1]
        mask = get_paddings_indicator(num_points, voxel_count, axis=0)
        mask = torch.unsqueeze(mask, -1).type_as(features)
        features *= mask

        for pfn in self.pfn_layers:
            features = pfn(features, num_points)

        # The last PFNLayer returns (N, 1, C). Squeeze only the pooled point
        # dimension: a bare squeeze() would also drop N when there is a
        # single pillar (or C when there is a single output channel).
        return features.squeeze(1)
""" def __init__(self, in_channels=4, feat_channels=(64, ), with_distance=False, with_cluster_center=True, with_voxel_center=True, voxel_size=(0.2, 0.2, 4), point_cloud_range=(0, -40, -3, 70.4, 40, 1), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), mode='max'): super(DynamicPillarFeatureNet, self).__init__( in_channels, feat_channels, with_distance, with_cluster_center=with_cluster_center, with_voxel_center=with_voxel_center, voxel_size=voxel_size, point_cloud_range=point_cloud_range, norm_cfg=norm_cfg, mode=mode) self.fp16_enabled = False feat_channels = [self.in_channels] + list(feat_channels) pfn_layers = [] # TODO: currently only support one PFNLayer for i in range(len(feat_channels) - 1): in_filters = feat_channels[i] out_filters = feat_channels[i + 1] if i > 0: in_filters *= 2 norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters) pfn_layers.append( nn.Sequential( nn.Linear(in_filters, out_filters, bias=False), norm_layer, nn.ReLU(inplace=True))) self.num_pfn = len(pfn_layers) self.pfn_layers = nn.ModuleList(pfn_layers) self.pfn_scatter = DynamicScatter(voxel_size, point_cloud_range, (mode != 'max')) self.cluster_scatter = DynamicScatter( voxel_size, point_cloud_range, average_points=True) def map_voxel_center_to_point(self, pts_coors, voxel_mean, voxel_coors): """Map the centers of voxels to its corresponding points. Args: pts_coors (torch.Tensor): The coordinates of each points, shape (M, 3), where M is the number of points. voxel_mean (torch.Tensor): The mean or aggreagated features of a voxel, shape (N, C), where N is the number of voxels. voxel_coors (torch.Tensor): The coordinates of each voxel. Returns: torch.Tensor: Corresponding voxel centers of each points, shape (M, C), where M is the numver of points. 
""" # Step 1: scatter voxel into canvas # Calculate necessary things for canvas creation canvas_y = int( (self.point_cloud_range[4] - self.point_cloud_range[1]) / self.vy) canvas_x = int( (self.point_cloud_range[3] - self.point_cloud_range[0]) / self.vx) canvas_channel = voxel_mean.size(1) batch_size = pts_coors[-1, 0] + 1 canvas_len = canvas_y * canvas_x * batch_size # Create the canvas for this sample canvas = voxel_mean.new_zeros(canvas_channel, canvas_len) # Only include non-empty pillars indices = ( voxel_coors[:, 0] * canvas_y * canvas_x + voxel_coors[:, 2] * canvas_x + voxel_coors[:, 3]) # Scatter the blob back to the canvas canvas[:, indices.long()] = voxel_mean.t() # Step 2: get voxel mean for each point voxel_index = ( pts_coors[:, 0] * canvas_y * canvas_x + pts_coors[:, 2] * canvas_x + pts_coors[:, 3]) center_per_point = canvas[:, voxel_index.long()].t() return center_per_point @force_fp32(out_fp16=True) def forward(self, features, coors): """Forward function. Args: features (torch.Tensor): Point features or raw points in shape (N, M, C). coors (torch.Tensor): Coordinates of each voxel Returns: torch.Tensor: Features of pillars. 
def get_paddings_indicator(actual_num, max_num, axis=0):
    """Build a boolean validity mask for a zero-padded tensor.

    Position ``j`` along the inserted dimension is marked True when ``j`` is
    smaller than the corresponding entry of ``actual_num``.

    Args:
        actual_num (torch.Tensor): Actual number of points in each voxel.
        max_num (int): Max number of points in each voxel.
        axis (int, optional): The mask dimension is inserted at position
            ``axis + 1`` of ``actual_num``. Defaults to 0.

    Returns:
        torch.Tensor: Mask indicates which points are valid inside a voxel.
    """
    slot_dim = axis + 1
    # e.g. counts [3, 4, 2] -> [[3], [4], [2]]
    counts = actual_num.unsqueeze(slot_dim)

    # View shape that is 1 everywhere except along the inserted dimension,
    # so the slot indices broadcast against `counts`.
    view_shape = [1] * counts.dim()
    view_shape[slot_dim] = -1
    slot_ids = torch.arange(
        max_num, dtype=torch.int, device=counts.device).view(view_shape)

    # counts:   [[3], [4], [2]]
    # slot_ids: [[0, 1, 2, 3, 4]]
    # A slot is valid while its index is below the actual count.
    return counts.int() > slot_ids
class PFNLayer(nn.Module):
    """Pillar Feature Net Layer.

    The Pillar Feature Net is composed of a series of these layers, but the
    PointPillars paper results only used a single PFNLayer.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels. When this is not the
            last layer the linear layer produces ``out_channels // 2``
            channels, and the pooled features are concatenated to them.
        norm_cfg (dict): Config dict of normalization layers
        last_layer (bool): If last_layer, there is no concatenation of
            features.
        mode (str): Pooling mode to gather features inside voxels.
            Options are 'max' or 'avg'. Default to 'max'.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
                 last_layer=False,
                 mode='max'):

        super().__init__()
        self.fp16_enabled = False
        self.name = 'PFNLayer'
        self.last_vfe = last_layer
        if not self.last_vfe:
            # Half of the output comes from the point-wise features and the
            # other half from the pooled features concatenated in forward.
            out_channels = out_channels // 2
        self.units = out_channels

        self.norm = build_norm_layer(norm_cfg, self.units)[1]
        self.linear = nn.Linear(in_channels, self.units, bias=False)

        assert mode in ['max', 'avg']
        self.mode = mode

    # `apply_to` must be an iterable of argument names. The former
    # `('inputs')` was a plain string, which only matched by substring.
    @auto_fp16(apply_to=('inputs', ), out_fp32=True)
    def forward(self, inputs, num_voxels=None, aligned_distance=None):
        """Forward function.

        Args:
            inputs (torch.Tensor): Pillar/Voxel inputs with shape (N, M, C).
                N is the number of voxels, M is the number of points in
                voxels, C is the number of channels of point features.
            num_voxels (torch.Tensor, optional): Number of points in each
                voxel. Required when ``mode`` is 'avg'. Defaults to None.
            aligned_distance (torch.Tensor, optional): The distance of
                each points to the voxel center. When given, the point-wise
                features are multiplied by it before pooling.
                Defaults to None.

        Returns:
            torch.Tensor: Features of Pillars. Shape (N, 1, units) for the
                last layer, otherwise (N, M, 2 * units).
        """
        x = self.linear(inputs)
        # Move channels to dim 1 for the norm layer, then restore (N, M, C).
        x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2,
                                                               1).contiguous()
        x = F.relu(x)

        # The optional distance weighting is identical for both pooling modes.
        if aligned_distance is not None:
            x = x.mul(aligned_distance.unsqueeze(-1))

        if self.mode == 'max':
            x_max = torch.max(x, dim=1, keepdim=True)[0]
        elif self.mode == 'avg':
            # Divide by the real point count, not by the padded length M.
            x_max = x.sum(
                dim=1, keepdim=True) / num_voxels.type_as(inputs).view(
                    -1, 1, 1)

        if self.last_vfe:
            return x_max

        # Broadcast the pooled feature to every point and concatenate it.
        x_repeat = x_max.repeat(1, inputs.shape[1], 1)
        return torch.cat([x, x_repeat], dim=2)
@VOXEL_ENCODERS.register_module()
class DynamicVFE(nn.Module):
    """Dynamic Voxel feature encoder used in DV-SECOND.

    It encodes features of voxels and their points. It could also fuse
    image feature into voxel features in a point-wise manner.
    The number of points inside the voxel varies.

    Args:
        in_channels (int): Input channels of VFE. Defaults to 4.
        feat_channels (list(int)): Channels of features in VFE. Must not be
            empty.
        with_distance (bool): Whether to use the L2 distance of points to the
            origin point (1 extra channel). Default False.
        with_cluster_center (bool): Whether to use the distance to cluster
            center of points inside a voxel (3 extra channels).
            Default to False.
        with_voxel_center (bool): Whether to use the distance to center of
            voxel for each points inside a voxel (3 extra channels).
            Default to False.
        voxel_size (tuple[float]): Size of a single voxel.
            Default to (0.2, 0.2, 4).
        point_cloud_range (tuple[float]): The range of points or voxels.
            Default to (0, -40, -3, 70.4, 40, 1).
        norm_cfg (dict): Config dict of normalization layers.
        mode (str): The mode when pooling features of points inside a voxel.
            Available options include 'max' and 'avg'. Default to 'max'.
        fusion_layer (dict | None): The config dict of fusion layer used in
            multi-modal detectors. Default to None.
        return_point_feats (bool): Whether to return the features of each
            points. Default to False.
    """

    def __init__(self,
                 in_channels=4,
                 feat_channels=[],
                 with_distance=False,
                 with_cluster_center=False,
                 with_voxel_center=False,
                 voxel_size=(0.2, 0.2, 4),
                 point_cloud_range=(0, -40, -3, 70.4, 40, 1),
                 norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
                 mode='max',
                 fusion_layer=None,
                 return_point_feats=False):
        super(DynamicVFE, self).__init__()
        assert mode in ['avg', 'max']
        assert len(feat_channels) > 0
        if with_cluster_center:
            in_channels += 3
        if with_voxel_center:
            in_channels += 3
        if with_distance:
            # forward() appends one norm channel (keepdim=True), not three.
            # Adding 3 here sized the first Linear layer incorrectly.
            in_channels += 1
        self.in_channels = in_channels
        self._with_distance = with_distance
        self._with_cluster_center = with_cluster_center
        self._with_voxel_center = with_voxel_center
        self.return_point_feats = return_point_feats
        self.fp16_enabled = False

        # Need pillar (voxel) size and x/y offset in order to calculate offset
        self.vx = voxel_size[0]
        self.vy = voxel_size[1]
        self.vz = voxel_size[2]
        self.x_offset = self.vx / 2 + point_cloud_range[0]
        self.y_offset = self.vy / 2 + point_cloud_range[1]
        self.z_offset = self.vz / 2 + point_cloud_range[2]
        self.point_cloud_range = point_cloud_range
        self.scatter = DynamicScatter(voxel_size, point_cloud_range, True)

        feat_channels = [self.in_channels] + list(feat_channels)
        vfe_layers = []
        for i in range(len(feat_channels) - 1):
            in_filters = feat_channels[i]
            out_filters = feat_channels[i + 1]
            if i > 0:
                # Later layers take point features concatenated with the
                # voxel features mapped back to each point (see forward).
                in_filters *= 2
            norm_layer = build_norm_layer(norm_cfg, out_filters)[1]
            vfe_layers.append(
                nn.Sequential(
                    nn.Linear(in_filters, out_filters, bias=False), norm_layer,
                    nn.ReLU(inplace=True)))
        self.vfe_layers = nn.ModuleList(vfe_layers)
        self.num_vfe = len(vfe_layers)
        self.vfe_scatter = DynamicScatter(voxel_size, point_cloud_range,
                                          (mode != 'max'))
        self.cluster_scatter = DynamicScatter(
            voxel_size, point_cloud_range, average_points=True)
        self.fusion_layer = None
        if fusion_layer is not None:
            self.fusion_layer = builder.build_fusion_layer(fusion_layer)

    def map_voxel_center_to_point(self, pts_coors, voxel_mean, voxel_coors):
        """Map voxel features to its corresponding points.

        Args:
            pts_coors (torch.Tensor): Voxel coordinate of each point.
            voxel_mean (torch.Tensor): Voxel features to be mapped.
            voxel_coors (torch.Tensor): Coordinates of valid voxels

        Returns:
            torch.Tensor: Features or centers of each point.
        """
        # Step 1: scatter voxel into canvas
        # Calculate necessary things for canvas creation
        canvas_z = int(
            (self.point_cloud_range[5] - self.point_cloud_range[2]) / self.vz)
        canvas_y = int(
            (self.point_cloud_range[4] - self.point_cloud_range[1]) / self.vy)
        canvas_x = int(
            (self.point_cloud_range[3] - self.point_cloud_range[0]) / self.vx)
        # canvas_channel = voxel_mean.size(1)
        # NOTE(review): takes the batch index of the last point as the
        # largest one, i.e. presumes points are ordered by batch - confirm.
        batch_size = pts_coors[-1, 0] + 1
        canvas_len = canvas_z * canvas_y * canvas_x * batch_size
        # Create the canvas for this sample
        # The canvas stores, per flattened voxel position, the row index of
        # that voxel in `voxel_mean`.
        canvas = voxel_mean.new_zeros(canvas_len, dtype=torch.long)
        # Only include non-empty pillars
        indices = (
            voxel_coors[:, 0] * canvas_z * canvas_y * canvas_x +
            voxel_coors[:, 1] * canvas_y * canvas_x +
            voxel_coors[:, 2] * canvas_x + voxel_coors[:, 3])
        # Scatter the blob back to the canvas
        canvas[indices.long()] = torch.arange(
            start=0, end=voxel_mean.size(0), device=voxel_mean.device)

        # Step 2: get voxel mean for each point
        voxel_index = (
            pts_coors[:, 0] * canvas_z * canvas_y * canvas_x +
            pts_coors[:, 1] * canvas_y * canvas_x +
            pts_coors[:, 2] * canvas_x + pts_coors[:, 3])
        voxel_inds = canvas[voxel_index.long()]
        center_per_point = voxel_mean[voxel_inds, ...]
        return center_per_point

    @force_fp32(out_fp16=True)
    def forward(self,
                features,
                coors,
                points=None,
                img_feats=None,
                img_metas=None):
        """Forward functions.

        Args:
            features (torch.Tensor): Per-point features, shape is NxC. The
                first three channels are used as x, y, z.
            coors (torch.Tensor): Voxel coordinates of every point, shape is
                Nx(1+NDim). Column 0 is used as the batch index and
                columns 1, 2, 3 as the z, y, x indices.
            points (list[torch.Tensor], optional): Raw points used to guide
                the multi-modality fusion. Defaults to None.
            img_feats (list[torch.Tensor], optional): Image features used
                for multi-modality fusion. Defaults to None.
            img_metas (dict, optional): Meta information forwarded to the
                fusion layer. Defaults to None.

        Returns:
            tuple: If `return_point_feats` is False, returns voxel features
                and its coordinates. If `return_point_feats` is True,
                returns feature of each points inside voxels.
        """
        features_ls = [features]
        # Find distance of x, y, and z from cluster center
        if self._with_cluster_center:
            voxel_mean, mean_coors = self.cluster_scatter(features, coors)
            points_mean = self.map_voxel_center_to_point(
                coors, voxel_mean, mean_coors)
            # TODO: maybe also do cluster for reflectivity
            f_cluster = features[:, :3] - points_mean[:, :3]
            features_ls.append(f_cluster)

        # Find distance of x, y, and z from pillar center
        if self._with_voxel_center:
            f_center = features.new_zeros(size=(features.size(0), 3))
            f_center[:, 0] = features[:, 0] - (
                coors[:, 3].type_as(features) * self.vx + self.x_offset)
            f_center[:, 1] = features[:, 1] - (
                coors[:, 2].type_as(features) * self.vy + self.y_offset)
            f_center[:, 2] = features[:, 2] - (
                coors[:, 1].type_as(features) * self.vz + self.z_offset)
            features_ls.append(f_center)

        if self._with_distance:
            # Single channel: L2 norm of (x, y, z) with keepdim=True.
            points_dist = torch.norm(features[:, :3], 2, 1, keepdim=True)
            features_ls.append(points_dist)

        # Combine together feature decorations
        features = torch.cat(features_ls, dim=-1)
        for i, vfe in enumerate(self.vfe_layers):
            point_feats = vfe(features)
            # Image fusion is applied to the output of the last layer only.
            if (i == len(self.vfe_layers) - 1 and self.fusion_layer is not None
                    and img_feats is not None):
                point_feats = self.fusion_layer(img_feats, points, point_feats,
                                                img_metas)
            voxel_feats, voxel_coors = self.vfe_scatter(point_feats, coors)
            if i != len(self.vfe_layers) - 1:
                # need to concat voxel feats if it is not the last vfe
                feat_per_point = self.map_voxel_center_to_point(
                    coors, voxel_feats, voxel_coors)
                features = torch.cat([point_feats, feat_per_point], dim=1)

        if self.return_point_feats:
            return point_feats
        return voxel_feats, voxel_coors
def __init__(self,
             in_channels=4,
             feat_channels=[],
             with_distance=False,
             with_cluster_center=False,
             with_voxel_center=False,
             voxel_size=(0.2, 0.2, 4),
             point_cloud_range=(0, -40, -3, 70.4, 40, 1),
             norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
             mode='max',
             fusion_layer=None,
             return_point_feats=False):
    """Build the stacked VFE layers and the optional fusion layer.

    The arguments are described in the class docstring. ``in_channels`` is
    widened here by the number of channels each enabled feature decoration
    appends in ``forward`` so that the first VFE layer matches its input.
    """
    super(HardVFE, self).__init__()
    assert len(feat_channels) > 0
    if with_cluster_center:
        # Offset (x, y, z) to the mean of the points in the voxel.
        in_channels += 3
    if with_voxel_center:
        # Offset (x, y, z) to the geometric center of the voxel.
        in_channels += 3
    if with_distance:
        # ``forward`` appends ``torch.norm(..., keepdim=True)``, which is a
        # single channel; adding 3 here made the first layer too wide and
        # broke ``with_distance=True`` with a shape mismatch.
        in_channels += 1
    self.in_channels = in_channels
    self._with_distance = with_distance
    self._with_cluster_center = with_cluster_center
    self._with_voxel_center = with_voxel_center
    self.return_point_feats = return_point_feats
    self.fp16_enabled = False

    # Need pillar (voxel) size and x/y offset to calculate pillar offset
    self.vx = voxel_size[0]
    self.vy = voxel_size[1]
    self.vz = voxel_size[2]
    # Coordinates of the center of the voxel with index (0, 0, 0).
    self.x_offset = self.vx / 2 + point_cloud_range[0]
    self.y_offset = self.vy / 2 + point_cloud_range[1]
    self.z_offset = self.vz / 2 + point_cloud_range[2]
    self.point_cloud_range = point_cloud_range
    self.scatter = DynamicScatter(voxel_size, point_cloud_range, True)

    feat_channels = [self.in_channels] + list(feat_channels)
    num_layers = len(feat_channels) - 1
    vfe_layers = []
    for i in range(num_layers):
        in_filters = feat_channels[i]
        out_filters = feat_channels[i + 1]
        if i > 0:
            # Every non-final layer is built with ``cat_max=True``, so the
            # layer that follows it is sized for twice the channel count.
            in_filters *= 2
        # TODO: pass norm_cfg to VFE
        # norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters)
        if i == num_layers - 1:
            # Last layer: no concatenation. Pooling is skipped when a
            # fusion layer is configured because ``fusion_with_mask``
            # performs the max over points after fusing image features.
            cat_max = False
            max_out = True
            if fusion_layer:
                max_out = False
        else:
            max_out = True
            cat_max = True
        vfe_layers.append(
            VFELayer(
                in_filters,
                out_filters,
                norm_cfg=norm_cfg,
                max_out=max_out,
                cat_max=cat_max))
    self.vfe_layers = nn.ModuleList(vfe_layers)
    self.num_vfe = len(vfe_layers)

    self.fusion_layer = None
    if fusion_layer is not None:
        self.fusion_layer = builder.build_fusion_layer(fusion_layer)
""" features_ls = [features] # Find distance of x, y, and z from cluster center if self._with_cluster_center: points_mean = ( features[:, :, :3].sum(dim=1, keepdim=True) / num_points.type_as(features).view(-1, 1, 1)) # TODO: maybe also do cluster for reflectivity f_cluster = features[:, :, :3] - points_mean features_ls.append(f_cluster) # Find distance of x, y, and z from pillar center if self._with_voxel_center: f_center = features.new_zeros( size=(features.size(0), features.size(1), 3)) f_center[:, :, 0] = features[:, :, 0] - ( coors[:, 3].type_as(features).unsqueeze(1) * self.vx + self.x_offset) f_center[:, :, 1] = features[:, :, 1] - ( coors[:, 2].type_as(features).unsqueeze(1) * self.vy + self.y_offset) f_center[:, :, 2] = features[:, :, 2] - ( coors[:, 1].type_as(features).unsqueeze(1) * self.vz + self.z_offset) features_ls.append(f_center) if self._with_distance: points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True) features_ls.append(points_dist) # Combine together feature decorations voxel_feats = torch.cat(features_ls, dim=-1) # The feature decorations were calculated without regard to whether # pillar was empty. # Need to ensure that empty voxels remain set to zeros. voxel_count = voxel_feats.shape[1] mask = get_paddings_indicator(num_points, voxel_count, axis=0) voxel_feats *= mask.unsqueeze(-1).type_as(voxel_feats) for i, vfe in enumerate(self.vfe_layers): voxel_feats = vfe(voxel_feats) if (self.fusion_layer is not None and img_feats is not None): voxel_feats = self.fusion_with_mask(features, mask, voxel_feats, coors, img_feats, img_metas) return voxel_feats def fusion_with_mask(self, features, mask, voxel_feats, coors, img_feats, img_metas): """Fuse image and point features with mask. Args: features (torch.Tensor): Features of voxel, usually it is the values of points in voxels. mask (torch.Tensor): Mask indicates valid features in each voxel. voxel_feats (torch.Tensor): Features of voxels. 
coors (torch.Tensor): Coordinates of each single voxel. img_feats (list[torch.Tensor]): Multi-scale feature maps of image. img_metas (list(dict)): Meta information of image and points. Returns: torch.Tensor: Fused features of each voxel. """ # the features is consist of a batch of points batch_size = coors[-1, 0] + 1 points = [] for i in range(batch_size): single_mask = (coors[:, 0] == i) points.append(features[single_mask][mask[single_mask]]) point_feats = voxel_feats[mask] point_feats = self.fusion_layer(img_feats, points, point_feats, img_metas) voxel_canvas = voxel_feats.new_zeros( size=(voxel_feats.size(0), voxel_feats.size(1), point_feats.size(-1))) voxel_canvas[mask] = point_feats out = torch.max(voxel_canvas, dim=1)[0] return out ================================================ FILE: mmdet3d/ops/__init__.py ================================================ from mmcv.ops import (RoIAlign, SigmoidFocalLoss, get_compiler_version, get_compiling_cuda_version, nms, roi_align, sigmoid_focal_loss) from .ball_query import ball_query from .furthest_point_sample import (Points_Sampler, furthest_point_sample, furthest_point_sample_with_dist) from .gather_points import gather_points from .group_points import (GroupAll, QueryAndGroup, group_points, grouping_operation) from .interpolate import three_interpolate, three_nn from .knn import knn from .norm import NaiveSyncBatchNorm1d, NaiveSyncBatchNorm2d from .pointnet_modules import (PointFPModule, PointSAModule, PointSAModuleMSG, build_sa_module) from .roiaware_pool3d import (RoIAwarePool3d, points_in_boxes_batch, points_in_boxes_cpu, points_in_boxes_gpu) from .sparse_block import (SparseBasicBlock, SparseBottleneck, make_sparse_convmodule) from .voxel import DynamicScatter, Voxelization, dynamic_scatter, voxelization __all__ = [ 'nms', 'soft_nms', 'RoIAlign', 'roi_align', 'get_compiler_version', 'get_compiling_cuda_version', 'NaiveSyncBatchNorm1d', 'NaiveSyncBatchNorm2d', 'batched_nms', 'Voxelization', 'voxelization', 
class BallQuery(Function):
    """Ball Query.

    Find nearby points in spherical space.
    """

    @staticmethod
    def forward(ctx, min_radius: float, max_radius: float, sample_num: int,
                xyz: torch.Tensor, center_xyz: torch.Tensor) -> torch.Tensor:
        """forward.

        Args:
            min_radius (float): minimum radius of the balls.
            max_radius (float): maximum radius of the balls.
            sample_num (int): maximum number of features in the balls.
            xyz (Tensor): (B, N, 3) xyz coordinates of the features.
            center_xyz (Tensor): (B, npoint, 3) centers of the ball query.

        Returns:
            Tensor: (B, npoint, sample_num) int32 tensor with the indices of
                the features that form the query balls.
        """
        assert center_xyz.is_contiguous()
        assert xyz.is_contiguous()
        assert min_radius < max_radius

        B, N, _ = xyz.size()
        npoint = center_xyz.size(1)
        # Allocate on the same device as the inputs. ``torch.cuda.IntTensor``
        # always uses the *current* CUDA device, which is the wrong one when
        # the inputs live on another GPU.
        idx = xyz.new_zeros((B, npoint, sample_num), dtype=torch.int32)

        ball_query_ext.ball_query_wrapper(B, N, npoint, min_radius, max_radius,
                                          sample_num, center_xyz, xyz, idx)
        # Indices are integers; no gradient flows through them.
        ctx.mark_non_differentiable(idx)
        return idx

    @staticmethod
    def backward(ctx, a=None):
        """Return one ``None`` gradient per ``forward`` input (five)."""
        return None, None, None, None, None
"ball_query_wrapper"); } ================================================ FILE: mmdet3d/ops/ball_query/src/ball_query_cuda.cu ================================================ // Modified from // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu #include #include #include #define THREADS_PER_BLOCK 256 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) __global__ void ball_query_kernel(int b, int n, int m, float min_radius, float max_radius, int nsample, const float *__restrict__ new_xyz, const float *__restrict__ xyz, int *__restrict__ idx) { // new_xyz: (B, M, 3) // xyz: (B, N, 3) // output: // idx: (B, M, nsample) int bs_idx = blockIdx.y; int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; if (bs_idx >= b || pt_idx >= m) return; new_xyz += bs_idx * m * 3 + pt_idx * 3; xyz += bs_idx * n * 3; idx += bs_idx * m * nsample + pt_idx * nsample; float max_radius2 = max_radius * max_radius; float min_radius2 = min_radius * min_radius; float new_x = new_xyz[0]; float new_y = new_xyz[1]; float new_z = new_xyz[2]; int cnt = 0; for (int k = 0; k < n; ++k) { float x = xyz[k * 3 + 0]; float y = xyz[k * 3 + 1]; float z = xyz[k * 3 + 2]; float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) { if (cnt == 0) { for (int l = 0; l < nsample; ++l) { idx[l] = k; } } idx[cnt] = k; ++cnt; if (cnt >= nsample) break; } } } void ball_query_kernel_launcher(int b, int n, int m, float min_radius, float max_radius, int nsample, const float *new_xyz, const float *xyz, int *idx, cudaStream_t stream) { // new_xyz: (B, M, 3) // xyz: (B, N, 3) // output: // idx: (B, M, nsample) cudaError_t err; dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) dim3 threads(THREADS_PER_BLOCK); ball_query_kernel<<>>(b, n, m, min_radius, max_radius, nsample, new_xyz, xyz, idx); // cudaDeviceSynchronize(); // for using printf in kernel function err = 
class FurthestPointSampling(Function):
    """Furthest Point Sampling.

    Uses iterative furthest point sampling to select a set of features whose
    corresponding points have the furthest distance.
    """

    @staticmethod
    def forward(ctx, points_xyz: torch.Tensor,
                num_points: int) -> torch.Tensor:
        """forward.

        Args:
            points_xyz (Tensor): (B, N, 3) where N > num_points.
            num_points (int): Number of points in the sampled set.

        Returns:
            Tensor: (B, num_points) int32 indices of the sampled points.
        """
        assert points_xyz.is_contiguous()

        B, N = points_xyz.size()[:2]
        # Allocate the buffers on the device of the input rather than on the
        # current CUDA device (what ``torch.cuda.*Tensor`` does), matching
        # ``FurthestPointSamplingWithDist``.
        output = points_xyz.new_zeros((B, num_points), dtype=torch.int32)
        # Running minimum distance of every point to the selected set; kept
        # float32 as before and seeded with a large value.
        temp = points_xyz.new_full((B, N), 1e10, dtype=torch.float32)

        furthest_point_sample_ext.furthest_point_sampling_wrapper(
            B, N, num_points, points_xyz, temp, output)
        # Indices are integers; no gradient flows through them.
        ctx.mark_non_differentiable(output)
        return output

    @staticmethod
    def backward(ctx, a=None):
        """Return one ``None`` gradient per ``forward`` input (two)."""
        return None, None
class Points_Sampler(nn.Module):
    """Points sampling.

    Args:
        num_point (list[int]): Number of sample points.
        fps_mod_list (list[str]): Type of FPS method, valid mod
            ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS'].
            F-FPS: using feature distances for FPS.
            D-FPS: using Euclidean distances of points for FPS.
            FS: using F-FPS and D-FPS simultaneously.
        fps_sample_range_list (list[int]): Range of points to apply FPS.
            Each entry is the (exclusive) end index of the slice handled by
            the corresponding sampler; -1 means "up to the last point".
            Default: [-1].
    """

    def __init__(self,
                 num_point: List[int],
                 fps_mod_list: List[str] = ['D-FPS'],
                 fps_sample_range_list: List[int] = [-1]):
        super(Points_Sampler, self).__init__()
        # FPS would be applied to different fps_mod in the list,
        # so the length of the num_point should be equal to
        # fps_mod_list and fps_sample_range_list.
        assert len(num_point) == len(fps_mod_list) == len(
            fps_sample_range_list)
        self.num_point = num_point
        self.fps_sample_range_list = fps_sample_range_list
        self.samplers = nn.ModuleList()
        for fps_mod in fps_mod_list:
            self.samplers.append(get_sampler_type(fps_mod)())
        self.fp16_enabled = False

    @force_fp32()
    def forward(self, points_xyz, features):
        """forward.

        Args:
            points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
            features (Tensor): (B, C, N) Descriptors of the features.

        Return:
            Tensor: (B, M) Indices of sampled points, where M is the total
                number of indices returned by all samplers, concatenated in
                sampler order.
        """
        indices = []
        last_fps_end_index = 0

        for fps_sample_range, sampler, npoint in zip(
                self.fps_sample_range_list, self.samplers, self.num_point):
            assert fps_sample_range < points_xyz.shape[1]

            if fps_sample_range == -1:
                sample_points_xyz = points_xyz[:, last_fps_end_index:]
                sample_features = features[:, :, last_fps_end_index:]
            else:
                sample_points_xyz = \
                    points_xyz[:, last_fps_end_index:fps_sample_range]
                sample_features = \
                    features[:, :, last_fps_end_index:fps_sample_range]

            fps_idx = sampler(sample_points_xyz.contiguous(), sample_features,
                              npoint)

            # The sampler returns indices relative to its slice; shift them
            # back into the index space of the full point set.
            indices.append(fps_idx + last_fps_end_index)
            # ``fps_sample_range`` is an absolute end index (see the slices
            # above), so the next slice starts exactly there. Accumulating
            # it with ``+=`` skipped points from the third range onwards.
            last_fps_end_index = fps_sample_range
        indices = torch.cat(indices, dim=1)

        return indices
""" def __init__(self): super(FFPS_Sampler, self).__init__() def forward(self, points, features, npoint): """Sampling points with F-FPS.""" features_for_fps = torch.cat([points, features.transpose(1, 2)], dim=2) features_dist = calc_square_dist( features_for_fps, features_for_fps, norm=False) fps_idx = furthest_point_sample_with_dist(features_dist, npoint) return fps_idx class FS_Sampler(nn.Module): """FS_Sampling. Using F-FPS and D-FPS simultaneously. """ def __init__(self): super(FS_Sampler, self).__init__() def forward(self, points, features, npoint): """Sampling points with FS_Sampling.""" features_for_fps = torch.cat([points, features.transpose(1, 2)], dim=2) features_dist = calc_square_dist( features_for_fps, features_for_fps, norm=False) fps_idx_ffps = furthest_point_sample_with_dist(features_dist, npoint) fps_idx_dfps = furthest_point_sample(points, npoint) fps_idx = torch.cat([fps_idx_ffps, fps_idx_dfps], dim=1) return fps_idx ================================================ FILE: mmdet3d/ops/furthest_point_sample/src/furthest_point_sample.cpp ================================================ // Modified from // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling.cpp #include #include #include #include #include extern THCState *state; int furthest_point_sampling_wrapper(int b, int n, int m, at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor); void furthest_point_sampling_kernel_launcher(int b, int n, int m, const float *dataset, float *temp, int *idxs, cudaStream_t stream); int furthest_point_sampling_with_dist_wrapper(int b, int n, int m, at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor); void furthest_point_sampling_with_dist_kernel_launcher(int b, int n, int m, const float *dataset, float *temp, int *idxs, cudaStream_t stream); int furthest_point_sampling_wrapper(int b, int n, int m, at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor) { const float *points = 
// Picks the block size for a sampling kernel: the largest power of two that
// does not exceed `work_size`, clamped to the range [1, TOTAL_THREADS].
//
// Computed with integer arithmetic only. The previous floating-point
// log(work_size) / log(2) could round just below an exact power of two, and
// produced an undefined cast and shift for work_size <= 0; both cases now
// resolve deterministically (non-positive sizes yield 1).
inline int opt_n_threads(int work_size) {
  int n_threads = 1;
  // n_threads stays below TOTAL_THREADS inside the loop, so doubling it
  // cannot overflow.
  while (n_threads < TOTAL_THREADS && n_threads * 2 <= work_size) {
    n_threads *= 2;
  }
  return n_threads;
}
i2 : i1; } template __global__ void furthest_point_sampling_kernel( int b, int n, int m, const float *__restrict__ dataset, float *__restrict__ temp, int *__restrict__ idxs) { // dataset: (B, N, 3) // tmp: (B, N) // output: // idx: (B, M) if (m <= 0) return; __shared__ float dists[block_size]; __shared__ int dists_i[block_size]; int batch_index = blockIdx.x; dataset += batch_index * n * 3; temp += batch_index * n; idxs += batch_index * m; int tid = threadIdx.x; const int stride = block_size; int old = 0; if (threadIdx.x == 0) idxs[0] = old; __syncthreads(); for (int j = 1; j < m; j++) { int besti = 0; float best = -1; float x1 = dataset[old * 3 + 0]; float y1 = dataset[old * 3 + 1]; float z1 = dataset[old * 3 + 2]; for (int k = tid; k < n; k += stride) { float x2, y2, z2; x2 = dataset[k * 3 + 0]; y2 = dataset[k * 3 + 1]; z2 = dataset[k * 3 + 2]; // float mag = (x2 * x2) + (y2 * y2) + (z2 * z2); // if (mag <= 1e-3) // continue; float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1); float d2 = min(d, temp[k]); temp[k] = d2; besti = d2 > best ? k : besti; best = d2 > best ? 
d2 : best; } dists[tid] = best; dists_i[tid] = besti; __syncthreads(); if (block_size >= 1024) { if (tid < 512) { __update(dists, dists_i, tid, tid + 512); } __syncthreads(); } if (block_size >= 512) { if (tid < 256) { __update(dists, dists_i, tid, tid + 256); } __syncthreads(); } if (block_size >= 256) { if (tid < 128) { __update(dists, dists_i, tid, tid + 128); } __syncthreads(); } if (block_size >= 128) { if (tid < 64) { __update(dists, dists_i, tid, tid + 64); } __syncthreads(); } if (block_size >= 64) { if (tid < 32) { __update(dists, dists_i, tid, tid + 32); } __syncthreads(); } if (block_size >= 32) { if (tid < 16) { __update(dists, dists_i, tid, tid + 16); } __syncthreads(); } if (block_size >= 16) { if (tid < 8) { __update(dists, dists_i, tid, tid + 8); } __syncthreads(); } if (block_size >= 8) { if (tid < 4) { __update(dists, dists_i, tid, tid + 4); } __syncthreads(); } if (block_size >= 4) { if (tid < 2) { __update(dists, dists_i, tid, tid + 2); } __syncthreads(); } if (block_size >= 2) { if (tid < 1) { __update(dists, dists_i, tid, tid + 1); } __syncthreads(); } old = dists_i[0]; if (tid == 0) idxs[j] = old; } } void furthest_point_sampling_kernel_launcher(int b, int n, int m, const float *dataset, float *temp, int *idxs, cudaStream_t stream) { // dataset: (B, N, 3) // tmp: (B, N) // output: // idx: (B, M) cudaError_t err; unsigned int n_threads = opt_n_threads(n); switch (n_threads) { case 1024: furthest_point_sampling_kernel<1024> <<>>(b, n, m, dataset, temp, idxs); break; case 512: furthest_point_sampling_kernel<512> <<>>(b, n, m, dataset, temp, idxs); break; case 256: furthest_point_sampling_kernel<256> <<>>(b, n, m, dataset, temp, idxs); break; case 128: furthest_point_sampling_kernel<128> <<>>(b, n, m, dataset, temp, idxs); break; case 64: furthest_point_sampling_kernel<64> <<>>(b, n, m, dataset, temp, idxs); break; case 32: furthest_point_sampling_kernel<32> <<>>(b, n, m, dataset, temp, idxs); break; case 16: furthest_point_sampling_kernel<16> 
<<>>(b, n, m, dataset, temp, idxs); break; case 8: furthest_point_sampling_kernel<8> <<>>(b, n, m, dataset, temp, idxs); break; case 4: furthest_point_sampling_kernel<4> <<>>(b, n, m, dataset, temp, idxs); break; case 2: furthest_point_sampling_kernel<2> <<>>(b, n, m, dataset, temp, idxs); break; case 1: furthest_point_sampling_kernel<1> <<>>(b, n, m, dataset, temp, idxs); break; default: furthest_point_sampling_kernel<512> <<>>(b, n, m, dataset, temp, idxs); } err = cudaGetLastError(); if (cudaSuccess != err) { fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); exit(-1); } } // Modified from // https://github.com/qiqihaer/3DSSD-pytorch/blob/master/lib/pointnet2/src/sampling_gpu.cu template __global__ void furthest_point_sampling_with_dist_kernel( int b, int n, int m, const float *__restrict__ dataset, float *__restrict__ temp, int *__restrict__ idxs) { // dataset: (B, N, N) // tmp: (B, N) // output: // idx: (B, M) if (m <= 0) return; __shared__ float dists[block_size]; __shared__ int dists_i[block_size]; int batch_index = blockIdx.x; dataset += batch_index * n * n; temp += batch_index * n; idxs += batch_index * m; int tid = threadIdx.x; const int stride = block_size; int old = 0; if (threadIdx.x == 0) idxs[0] = old; __syncthreads(); for (int j = 1; j < m; j++) { int besti = 0; float best = -1; // float x1 = dataset[old * 3 + 0]; // float y1 = dataset[old * 3 + 1]; // float z1 = dataset[old * 3 + 2]; for (int k = tid; k < n; k += stride) { // float x2, y2, z2; // x2 = dataset[k * 3 + 0]; // y2 = dataset[k * 3 + 1]; // z2 = dataset[k * 3 + 2]; // float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * // (z2 - z1); float d = dataset[old * n + k]; float d2 = min(d, temp[k]); temp[k] = d2; besti = d2 > best ? k : besti; best = d2 > best ? 
d2 : best; } dists[tid] = best; dists_i[tid] = besti; __syncthreads(); if (block_size >= 1024) { if (tid < 512) { __update(dists, dists_i, tid, tid + 512); } __syncthreads(); } if (block_size >= 512) { if (tid < 256) { __update(dists, dists_i, tid, tid + 256); } __syncthreads(); } if (block_size >= 256) { if (tid < 128) { __update(dists, dists_i, tid, tid + 128); } __syncthreads(); } if (block_size >= 128) { if (tid < 64) { __update(dists, dists_i, tid, tid + 64); } __syncthreads(); } if (block_size >= 64) { if (tid < 32) { __update(dists, dists_i, tid, tid + 32); } __syncthreads(); } if (block_size >= 32) { if (tid < 16) { __update(dists, dists_i, tid, tid + 16); } __syncthreads(); } if (block_size >= 16) { if (tid < 8) { __update(dists, dists_i, tid, tid + 8); } __syncthreads(); } if (block_size >= 8) { if (tid < 4) { __update(dists, dists_i, tid, tid + 4); } __syncthreads(); } if (block_size >= 4) { if (tid < 2) { __update(dists, dists_i, tid, tid + 2); } __syncthreads(); } if (block_size >= 2) { if (tid < 1) { __update(dists, dists_i, tid, tid + 1); } __syncthreads(); } old = dists_i[0]; if (tid == 0) idxs[j] = old; } } void furthest_point_sampling_with_dist_kernel_launcher(int b, int n, int m, const float *dataset, float *temp, int *idxs, cudaStream_t stream) { // dataset: (B, N, N) // temp: (B, N) // output: // idx: (B, M) cudaError_t err; unsigned int n_threads = opt_n_threads(n); switch (n_threads) { case 1024: furthest_point_sampling_with_dist_kernel<1024><<>>( b, n, m, dataset, temp, idxs); break; case 512: furthest_point_sampling_with_dist_kernel<512><<>>( b, n, m, dataset, temp, idxs); break; case 256: furthest_point_sampling_with_dist_kernel<256><<>>( b, n, m, dataset, temp, idxs); break; case 128: furthest_point_sampling_with_dist_kernel<128><<>>( b, n, m, dataset, temp, idxs); break; case 64: furthest_point_sampling_with_dist_kernel<64><<>>( b, n, m, dataset, temp, idxs); break; case 32: furthest_point_sampling_with_dist_kernel<32><<>>( b, n, m, 
def calc_square_dist(point_feat_a, point_feat_b, norm=True):
    """Calculating square distance between a and b.

    Uses the expansion ``|a - b|^2 = |a|^2 + |b|^2 - 2 * a . b`` so that the
    pairwise term is a single batched matrix product.

    Args:
        point_feat_a (Tensor): (B, N, C) Feature vector of each point.
        point_feat_b (Tensor): (B, M, C) Feature vector of each point.
        norm (Bool): Whether to normalize the distance. If True, the result
            is the square root of the squared distance divided by C.
            Default: True.

    Returns:
        Tensor: (B, N, M) Distance between each pair points.
    """
    num_channel = point_feat_a.shape[-1]
    # [bs, n, 1]
    a_square = torch.sum(point_feat_a.pow(2), dim=-1, keepdim=True)
    # [bs, 1, m]
    b_square = torch.sum(point_feat_b.pow(2), dim=-1).unsqueeze(dim=1)

    coor = torch.matmul(point_feat_a, point_feat_b.transpose(1, 2))

    # Broadcasting yields [bs, n, m] without materialising repeated copies.
    dist = a_square + b_square - 2 * coor
    if norm:
        # Rounding can leave a tiny negative value for (near-)identical
        # points; clamp it so that sqrt does not return NaN.
        dist = torch.sqrt(dist.clamp(min=0)) / num_channel
    return dist
""" assert features.is_contiguous() assert indices.is_contiguous() B, npoint = indices.size() _, C, N = features.size() output = torch.cuda.FloatTensor(B, C, npoint) gather_points_ext.gather_points_wrapper(B, C, N, npoint, features, indices, output) ctx.for_backwards = (indices, C, N) ctx.mark_non_differentiable(indices) return output @staticmethod def backward(ctx, grad_out): idx, C, N = ctx.for_backwards B, npoint = idx.size() grad_features = torch.cuda.FloatTensor(B, C, N).zero_() grad_out_data = grad_out.data.contiguous() gather_points_ext.gather_points_grad_wrapper(B, C, N, npoint, grad_out_data, idx, grad_features.data) return grad_features, None gather_points = GatherPoints.apply ================================================ FILE: mmdet3d/ops/gather_points/src/gather_points.cpp ================================================ #include #include #include #include #include extern THCState *state; int gather_points_wrapper(int b, int c, int n, int npoints, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor); void gather_points_kernel_launcher(int b, int c, int n, int npoints, const float *points, const int *idx, float *out, cudaStream_t stream); int gather_points_grad_wrapper(int b, int c, int n, int npoints, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor); void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints, const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream); int gather_points_wrapper(int b, int c, int n, int npoints, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor) { const float *points = points_tensor.data_ptr(); const int *idx = idx_tensor.data_ptr(); float *out = out_tensor.data_ptr(); cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); gather_points_kernel_launcher(b, c, n, npoints, points, idx, out, stream); return 1; } int gather_points_grad_wrapper(int b, int c, int n, int npoints, at::Tensor grad_out_tensor, 
at::Tensor idx_tensor, at::Tensor grad_points_tensor) { const float *grad_out = grad_out_tensor.data_ptr(); const int *idx = idx_tensor.data_ptr(); float *grad_points = grad_points_tensor.data_ptr(); cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); gather_points_grad_kernel_launcher(b, c, n, npoints, grad_out, idx, grad_points, stream); return 1; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("gather_points_wrapper", &gather_points_wrapper, "gather_points_wrapper"); m.def("gather_points_grad_wrapper", &gather_points_grad_wrapper, "gather_points_grad_wrapper"); } ================================================ FILE: mmdet3d/ops/gather_points/src/gather_points_cuda.cu ================================================ #include #include #define TOTAL_THREADS 1024 #define THREADS_PER_BLOCK 256 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) __global__ void gather_points_kernel(int b, int c, int n, int m, const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) { // points: (B, C, N) // idx: (B, M) // output: // out: (B, C, M) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; if (bs_idx >= b || c_idx >= c || pt_idx >= m) return; out += bs_idx * c * m + c_idx * m + pt_idx; idx += bs_idx * m + pt_idx; points += bs_idx * c * n + c_idx * n; out[0] = points[idx[0]]; } void gather_points_kernel_launcher(int b, int c, int n, int npoints, const float *points, const int *idx, float *out, cudaStream_t stream) { // points: (B, C, N) // idx: (B, npoints) // output: // out: (B, C, npoints) cudaError_t err; dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) dim3 threads(THREADS_PER_BLOCK); gather_points_kernel<<>>(b, c, n, npoints, points, idx, out); err = cudaGetLastError(); if (cudaSuccess != err) { fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); exit(-1); } } __global__ void gather_points_grad_kernel(int b, int c, int 
n, int m, const float *__restrict__ grad_out, const int *__restrict__ idx, float *__restrict__ grad_points) { // grad_out: (B, C, M) // idx: (B, M) // output: // grad_points: (B, C, N) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; if (bs_idx >= b || c_idx >= c || pt_idx >= m) return; grad_out += bs_idx * c * m + c_idx * m + pt_idx; idx += bs_idx * m + pt_idx; grad_points += bs_idx * c * n + c_idx * n; atomicAdd(grad_points + idx[0], grad_out[0]); } void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints, const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) { // grad_out: (B, C, npoints) // idx: (B, npoints) // output: // grad_points: (B, C, N) cudaError_t err; dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) dim3 threads(THREADS_PER_BLOCK); gather_points_grad_kernel<<>>( b, c, n, npoints, grad_out, idx, grad_points); err = cudaGetLastError(); if (cudaSuccess != err) { fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); exit(-1); } } ================================================ FILE: mmdet3d/ops/group_points/__init__.py ================================================ from .group_points import GroupAll, QueryAndGroup, grouping_operation __all__ = ['QueryAndGroup', 'GroupAll', 'grouping_operation'] ================================================ FILE: mmdet3d/ops/group_points/group_points.py ================================================ import torch from torch import nn as nn from torch.autograd import Function from typing import Tuple from ..ball_query import ball_query from . import group_points_ext class QueryAndGroup(nn.Module): """Query and Group. Groups with a ball query of radius Args: max_radius (float): The maximum radius of the balls. sample_num (int): Maximum number of features to gather in the ball. min_radius (float): The minimum radius of the balls. use_xyz (bool): Whether to use xyz. 
Default: True. return_grouped_xyz (bool): Whether to return grouped xyz. Default: False. normalize_xyz (bool): Whether to normalize xyz. Default: False. uniform_sample (bool): Whether to sample uniformly. Default: False return_unique_cnt (bool): Whether to return the count of unique samples. Default: False. """ def __init__(self, max_radius, sample_num, min_radius=0, use_xyz=True, return_grouped_xyz=False, normalize_xyz=False, uniform_sample=False, return_unique_cnt=False): super(QueryAndGroup, self).__init__() self.max_radius = max_radius self.min_radius = min_radius self.sample_num = sample_num self.use_xyz = use_xyz self.return_grouped_xyz = return_grouped_xyz self.normalize_xyz = normalize_xyz self.uniform_sample = uniform_sample self.return_unique_cnt = return_unique_cnt if self.return_unique_cnt: assert self.uniform_sample def forward(self, points_xyz, center_xyz, features=None): """forward. Args: points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. center_xyz (Tensor): (B, npoint, 3) Centriods. features (Tensor): (B, C, N) Descriptors of the features. Return: Tensor: (B, 3 + C, npoint, sample_num) Grouped feature. 
""" idx = ball_query(self.min_radius, self.max_radius, self.sample_num, points_xyz, center_xyz) if self.uniform_sample: unique_cnt = torch.zeros((idx.shape[0], idx.shape[1])) for i_batch in range(idx.shape[0]): for i_region in range(idx.shape[1]): unique_ind = torch.unique(idx[i_batch, i_region, :]) num_unique = unique_ind.shape[0] unique_cnt[i_batch, i_region] = num_unique sample_ind = torch.randint( 0, num_unique, (self.sample_num - num_unique, ), dtype=torch.long) all_ind = torch.cat((unique_ind, unique_ind[sample_ind])) idx[i_batch, i_region, :] = all_ind xyz_trans = points_xyz.transpose(1, 2).contiguous() # (B, 3, npoint, sample_num) grouped_xyz = grouping_operation(xyz_trans, idx) grouped_xyz -= center_xyz.transpose(1, 2).unsqueeze(-1) if self.normalize_xyz: grouped_xyz /= self.max_radius if features is not None: grouped_features = grouping_operation(features, idx) if self.use_xyz: # (B, C + 3, npoint, sample_num) new_features = torch.cat([grouped_xyz, grouped_features], dim=1) else: new_features = grouped_features else: assert (self.use_xyz ), 'Cannot have not features and not use xyz as a feature!' new_features = grouped_xyz ret = [new_features] if self.return_grouped_xyz: ret.append(grouped_xyz) if self.return_unique_cnt: ret.append(unique_cnt) if len(ret) == 1: return ret[0] else: return tuple(ret) class GroupAll(nn.Module): """Group All. Group xyz with feature. Args: use_xyz (bool): Whether to use xyz. """ def __init__(self, use_xyz: bool = True): super().__init__() self.use_xyz = use_xyz def forward(self, xyz: torch.Tensor, new_xyz: torch.Tensor, features: torch.Tensor = None): """forward. Args: xyz (Tensor): (B, N, 3) xyz coordinates of the features. new_xyz (Tensor): Ignored. features (Tensor): (B, C, N) features to group. Return: Tensor: (B, C + 3, 1, N) Grouped feature. 
""" grouped_xyz = xyz.transpose(1, 2).unsqueeze(2) if features is not None: grouped_features = features.unsqueeze(2) if self.use_xyz: new_features = torch.cat([grouped_xyz, grouped_features], dim=1) # (B, 3 + C, 1, N) else: new_features = grouped_features else: new_features = grouped_xyz return new_features class GroupingOperation(Function): """Grouping Operation. Group feature with given index. """ @staticmethod def forward(ctx, features: torch.Tensor, indices: torch.Tensor) -> torch.Tensor: """forward. Args: features (Tensor): (B, C, N) tensor of features to group. indices (Tensor): (B, npoint, nsample) the indicies of features to group with. Returns: Tensor: (B, C, npoint, nsample) Grouped features. """ assert features.is_contiguous() assert indices.is_contiguous() B, nfeatures, nsample = indices.size() _, C, N = features.size() output = torch.cuda.FloatTensor(B, C, nfeatures, nsample) group_points_ext.forward(B, C, N, nfeatures, nsample, features, indices, output) ctx.for_backwards = (indices, N) return output @staticmethod def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """backward. Args: grad_out (Tensor): (B, C, npoint, nsample) tensor of the gradients of the output from forward. Returns: Tensor: (B, C, N) gradient of the features. 
""" idx, N = ctx.for_backwards B, C, npoint, nsample = grad_out.size() grad_features = torch.cuda.FloatTensor(B, C, N).zero_() grad_out_data = grad_out.data.contiguous() group_points_ext.backward(B, C, N, npoint, nsample, grad_out_data, idx, grad_features.data) return grad_features, None grouping_operation = GroupingOperation.apply ================================================ FILE: mmdet3d/ops/group_points/src/group_points.cpp ================================================ // Modified from // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp #include #include #include #include #include #include extern THCState *state; int group_points_wrapper(int b, int c, int n, int npoints, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor); void group_points_kernel_launcher(int b, int c, int n, int npoints, int nsample, const float *points, const int *idx, float *out, cudaStream_t stream); int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor); void group_points_grad_kernel_launcher(int b, int c, int n, int npoints, int nsample, const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream); int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) { float *grad_points = grad_points_tensor.data_ptr(); const int *idx = idx_tensor.data_ptr(); const float *grad_out = grad_out_tensor.data_ptr(); cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); group_points_grad_kernel_launcher(b, c, n, npoints, nsample, grad_out, idx, grad_points, stream); return 1; } int group_points_wrapper(int b, int c, int n, int npoints, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor) { const float *points = points_tensor.data_ptr(); const int *idx = 
idx_tensor.data_ptr(); float *out = out_tensor.data_ptr(); cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); group_points_kernel_launcher(b, c, n, npoints, nsample, points, idx, out, stream); return 1; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("forward", &group_points_wrapper, "group_points_wrapper"); m.def("backward", &group_points_grad_wrapper, "group_points_grad_wrapper"); } ================================================ FILE: mmdet3d/ops/group_points/src/group_points_cuda.cu ================================================ // Modified from // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu #include #include #define THREADS_PER_BLOCK 256 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) __global__ void group_points_grad_kernel(int b, int c, int n, int npoints, int nsample, const float *__restrict__ grad_out, const int *__restrict__ idx, float *__restrict__ grad_points) { // grad_out: (B, C, npoints, nsample) // idx: (B, npoints, nsample) // output: // grad_points: (B, C, N) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; int index = blockIdx.x * blockDim.x + threadIdx.x; int pt_idx = index / nsample; if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return; int sample_idx = index % nsample; grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx; idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]); } void group_points_grad_kernel_launcher(int b, int c, int n, int npoints, int nsample, const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) { // grad_out: (B, C, npoints, nsample) // idx: (B, npoints, nsample) // output: // grad_points: (B, C, N) cudaError_t err; dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) dim3 threads(THREADS_PER_BLOCK); group_points_grad_kernel<<>>( b, c, n, 
npoints, nsample, grad_out, idx, grad_points); err = cudaGetLastError(); if (cudaSuccess != err) { fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); exit(-1); } } __global__ void group_points_kernel(int b, int c, int n, int npoints, int nsample, const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) { // points: (B, C, N) // idx: (B, npoints, nsample) // output: // out: (B, C, npoints, nsample) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; int index = blockIdx.x * blockDim.x + threadIdx.x; int pt_idx = index / nsample; if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return; int sample_idx = index % nsample; idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; int in_idx = bs_idx * c * n + c_idx * n + idx[0]; int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx; out[out_idx] = points[in_idx]; } void group_points_kernel_launcher(int b, int c, int n, int npoints, int nsample, const float *points, const int *idx, float *out, cudaStream_t stream) { // points: (B, C, N) // idx: (B, npoints, nsample) // output: // out: (B, C, npoints, nsample) cudaError_t err; dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) dim3 threads(THREADS_PER_BLOCK); group_points_kernel<<>>(b, c, n, npoints, nsample, points, idx, out); // cudaDeviceSynchronize(); // for using printf in kernel function err = cudaGetLastError(); if (cudaSuccess != err) { fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); exit(-1); } } ================================================ FILE: mmdet3d/ops/interpolate/__init__.py ================================================ from .three_interpolate import three_interpolate from .three_nn import three_nn __all__ = ['three_nn', 'three_interpolate'] ================================================ FILE: mmdet3d/ops/interpolate/src/interpolate.cpp 
================================================ // Modified from // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp #include #include #include #include #include #include #include #include #include extern THCState *state; void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor, at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor); void three_nn_kernel_launcher(int b, int n, int m, const float *unknown, const float *known, float *dist2, int *idx, cudaStream_t stream); void three_interpolate_wrapper(int b, int c, int m, int n, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor); void three_interpolate_kernel_launcher(int b, int c, int m, int n, const float *points, const int *idx, const float *weight, float *out, cudaStream_t stream); void three_interpolate_grad_wrapper(int b, int c, int n, int m, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor); void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m, const float *grad_out, const int *idx, const float *weight, float *grad_points, cudaStream_t stream); void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor, at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) { const float *unknown = unknown_tensor.data_ptr(); const float *known = known_tensor.data_ptr(); float *dist2 = dist2_tensor.data_ptr(); int *idx = idx_tensor.data_ptr(); cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); three_nn_kernel_launcher(b, n, m, unknown, known, dist2, idx, stream); } void three_interpolate_wrapper(int b, int c, int m, int n, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor) { const float *points = points_tensor.data_ptr(); const float *weight = weight_tensor.data_ptr(); float *out = out_tensor.data_ptr(); const int *idx = idx_tensor.data_ptr(); 
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); three_interpolate_kernel_launcher(b, c, m, n, points, idx, weight, out, stream); } void three_interpolate_grad_wrapper(int b, int c, int n, int m, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor) { const float *grad_out = grad_out_tensor.data_ptr(); const float *weight = weight_tensor.data_ptr(); float *grad_points = grad_points_tensor.data_ptr(); const int *idx = idx_tensor.data_ptr(); cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); three_interpolate_grad_kernel_launcher(b, c, n, m, grad_out, idx, weight, grad_points, stream); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("three_nn_wrapper", &three_nn_wrapper, "three_nn_wrapper"); m.def("three_interpolate_wrapper", &three_interpolate_wrapper, "three_interpolate_wrapper"); m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper, "three_interpolate_grad_wrapper"); } ================================================ FILE: mmdet3d/ops/interpolate/src/three_interpolate_cuda.cu ================================================ // Modified from // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate_gpu.cu #include #include #include #define THREADS_PER_BLOCK 256 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) __global__ void three_interpolate_kernel(int b, int c, int m, int n, const float *__restrict__ points, const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ out) { // points: (B, C, M) // idx: (B, N, 3) // weight: (B, N, 3) // output: // out: (B, C, N) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; if (bs_idx >= b || c_idx >= c || pt_idx >= n) return; weight += bs_idx * n * 3 + pt_idx * 3; points += bs_idx * c * m + c_idx * m; idx += bs_idx * n * 3 + pt_idx * 3; out += bs_idx * c * n + c_idx * n; out[pt_idx] = weight[0] * points[idx[0]] + 
weight[1] * points[idx[1]] + weight[2] * points[idx[2]]; } void three_interpolate_kernel_launcher(int b, int c, int m, int n, const float *points, const int *idx, const float *weight, float *out, cudaStream_t stream) { // points: (B, C, M) // idx: (B, N, 3) // weight: (B, N, 3) // output: // out: (B, C, N) cudaError_t err; dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) dim3 threads(THREADS_PER_BLOCK); three_interpolate_kernel<<>>(b, c, m, n, points, idx, weight, out); err = cudaGetLastError(); if (cudaSuccess != err) { fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); exit(-1); } } __global__ void three_interpolate_grad_kernel( int b, int c, int n, int m, const float *__restrict__ grad_out, const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ grad_points) { // grad_out: (B, C, N) // weight: (B, N, 3) // output: // grad_points: (B, C, M) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; if (bs_idx >= b || c_idx >= c || pt_idx >= n) return; grad_out += bs_idx * c * n + c_idx * n + pt_idx; weight += bs_idx * n * 3 + pt_idx * 3; grad_points += bs_idx * c * m + c_idx * m; idx += bs_idx * n * 3 + pt_idx * 3; atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]); atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]); atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]); } void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m, const float *grad_out, const int *idx, const float *weight, float *grad_points, cudaStream_t stream) { // grad_out: (B, C, N) // weight: (B, N, 3) // output: // grad_points: (B, C, M) cudaError_t err; dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) dim3 threads(THREADS_PER_BLOCK); three_interpolate_grad_kernel<<>>( b, c, n, m, grad_out, idx, weight, grad_points); err = cudaGetLastError(); if (cudaSuccess != err) { fprintf(stderr, "CUDA kernel failed : 
%s\n", cudaGetErrorString(err)); exit(-1); } } ================================================ FILE: mmdet3d/ops/interpolate/src/three_nn_cuda.cu ================================================ // Modified from // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate_gpu.cu #include #include #include #define THREADS_PER_BLOCK 256 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) __global__ void three_nn_kernel(int b, int n, int m, const float *__restrict__ unknown, const float *__restrict__ known, float *__restrict__ dist2, int *__restrict__ idx) { // unknown: (B, N, 3) // known: (B, M, 3) // output: // dist2: (B, N, 3) // idx: (B, N, 3) int bs_idx = blockIdx.y; int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; if (bs_idx >= b || pt_idx >= n) return; unknown += bs_idx * n * 3 + pt_idx * 3; known += bs_idx * m * 3; dist2 += bs_idx * n * 3 + pt_idx * 3; idx += bs_idx * n * 3 + pt_idx * 3; float ux = unknown[0]; float uy = unknown[1]; float uz = unknown[2]; double best1 = 1e40, best2 = 1e40, best3 = 1e40; int besti1 = 0, besti2 = 0, besti3 = 0; for (int k = 0; k < m; ++k) { float x = known[k * 3 + 0]; float y = known[k * 3 + 1]; float z = known[k * 3 + 2]; float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); if (d < best1) { best3 = best2; besti3 = besti2; best2 = best1; besti2 = besti1; best1 = d; besti1 = k; } else if (d < best2) { best3 = best2; besti3 = besti2; best2 = d; besti2 = k; } else if (d < best3) { best3 = d; besti3 = k; } } dist2[0] = best1; dist2[1] = best2; dist2[2] = best3; idx[0] = besti1; idx[1] = besti2; idx[2] = besti3; } void three_nn_kernel_launcher(int b, int n, int m, const float *unknown, const float *known, float *dist2, int *idx, cudaStream_t stream) { // unknown: (B, N, 3) // known: (B, M, 3) // output: // dist2: (B, N, 3) // idx: (B, N, 3) cudaError_t err; dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) dim3 threads(THREADS_PER_BLOCK); 
three_nn_kernel<<>>(b, n, m, unknown, known, dist2, idx); err = cudaGetLastError(); if (cudaSuccess != err) { fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); exit(-1); } } ================================================ FILE: mmdet3d/ops/interpolate/three_interpolate.py ================================================ import torch from torch.autograd import Function from typing import Tuple from . import interpolate_ext class ThreeInterpolate(Function): @staticmethod def forward(ctx, features: torch.Tensor, indices: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: """Performs weighted linear interpolation on 3 features. Args: features (Tensor): (B, C, M) Features descriptors to be interpolated from indices (Tensor): (B, n, 3) index three nearest neighbors of the target features in features weight (Tensor): (B, n, 3) weights of interpolation Returns: Tensor: (B, C, N) tensor of the interpolated features """ assert features.is_contiguous() assert indices.is_contiguous() assert weight.is_contiguous() B, c, m = features.size() n = indices.size(1) ctx.three_interpolate_for_backward = (indices, weight, m) output = torch.cuda.FloatTensor(B, c, n) interpolate_ext.three_interpolate_wrapper(B, c, m, n, features, indices, weight, output) return output @staticmethod def backward( ctx, grad_out: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """Backward of three interpolate. 
Args: grad_out (Tensor): (B, C, N) tensor with gradients of outputs Returns: Tensor: (B, C, M) tensor with gradients of features """ idx, weight, m = ctx.three_interpolate_for_backward B, c, n = grad_out.size() grad_features = torch.cuda.FloatTensor(B, c, m).zero_() grad_out_data = grad_out.data.contiguous() interpolate_ext.three_interpolate_grad_wrapper(B, c, n, m, grad_out_data, idx, weight, grad_features.data) return grad_features, None, None three_interpolate = ThreeInterpolate.apply ================================================ FILE: mmdet3d/ops/interpolate/three_nn.py ================================================ import torch from torch.autograd import Function from typing import Tuple from . import interpolate_ext class ThreeNN(Function): @staticmethod def forward(ctx, target: torch.Tensor, source: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """Find the top-3 nearest neighbors of the target set from the source set. Args: target (Tensor): shape (B, N, 3), points set that needs to find the nearest neighbors. source (Tensor): shape (B, M, 3), points set that is used to find the nearest neighbors of points in target set. Returns: Tensor: shape (B, N, 3), L2 distance of each point in target set to their corresponding nearest neighbors. 
""" assert target.is_contiguous() assert source.is_contiguous() B, N, _ = target.size() m = source.size(1) dist2 = torch.cuda.FloatTensor(B, N, 3) idx = torch.cuda.IntTensor(B, N, 3) interpolate_ext.three_nn_wrapper(B, N, m, target, source, dist2, idx) ctx.mark_non_differentiable(idx) return torch.sqrt(dist2), idx @staticmethod def backward(ctx, a=None, b=None): return None, None three_nn = ThreeNN.apply ================================================ FILE: mmdet3d/ops/iou3d/__init__.py ================================================ from .iou3d_utils import boxes_iou_bev, nms_gpu, nms_normal_gpu __all__ = ['boxes_iou_bev', 'nms_gpu', 'nms_normal_gpu'] ================================================ FILE: mmdet3d/ops/iou3d/iou3d_utils.py ================================================ import torch from . import iou3d_cuda def boxes_iou_bev(boxes_a, boxes_b): """Calculate boxes IoU in the bird view. Args: boxes_a (torch.Tensor): Input boxes a with shape (M, 5). boxes_b (torch.Tensor): Input boxes b with shape (N, 5). Returns: ans_iou (torch.Tensor): IoU result with shape (M, N). """ ans_iou = boxes_a.new_zeros( torch.Size((boxes_a.shape[0], boxes_b.shape[0]))) iou3d_cuda.boxes_iou_bev_gpu(boxes_a.contiguous(), boxes_b.contiguous(), ans_iou) return ans_iou def nms_gpu(boxes, scores, thresh, pre_maxsize=None, post_max_size=None): """Nms function with gpu implementation. Args: boxes (torch.Tensor): Input boxes with the shape of [N, 5] ([x1, y1, x2, y2, ry]). scores (torch.Tensor): Scores of boxes with the shape of [N]. thresh (int): Threshold. pre_maxsize (int): Max size of boxes before nms. Default: None. post_maxsize (int): Max size of boxes after nms. Default: None. Returns: torch.Tensor: Indexes after nms. 
""" order = scores.sort(0, descending=True)[1] if pre_maxsize is not None: order = order[:pre_maxsize] boxes = boxes[order].contiguous() keep = torch.zeros(boxes.size(0), dtype=torch.long) num_out = iou3d_cuda.nms_gpu(boxes, keep, thresh, boxes.device.index) keep = order[keep[:num_out].cuda(boxes.device)].contiguous() if post_max_size is not None: keep = keep[:post_max_size] return keep def nms_normal_gpu(boxes, scores, thresh): """Normal non maximum suppression on GPU. Args: boxes (torch.Tensor): Input boxes with shape (N, 5). scores (torch.Tensor): Scores of predicted boxes with shape (N). thresh (torch.Tensor): Threshold of non maximum suppression. Returns: torch.Tensor: Remaining indices with scores in descending order. """ order = scores.sort(0, descending=True)[1] boxes = boxes[order].contiguous() keep = torch.zeros(boxes.size(0), dtype=torch.long) num_out = iou3d_cuda.nms_normal_gpu(boxes, keep, thresh, boxes.device.index) return order[keep[:num_out].cuda(boxes.device)].contiguous() ================================================ FILE: mmdet3d/ops/iou3d/src/iou3d.cpp ================================================ // Modified from // https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms.cpp /* 3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others) Written by Shaoshuai Shi All Rights Reserved 2019-2020. 
*/ #include #include #include #include #include #define CHECK_CUDA(x) \ TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") #define CHECK_CONTIGUOUS(x) \ TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") #define CHECK_INPUT(x) \ CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) #define CHECK_ERROR(ans) \ { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) { if (code != cudaSuccess) { fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); } } const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8; void boxesoverlapLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap); void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou); void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num, float nms_overlap_thresh); void nmsNormalLauncher(const float *boxes, unsigned long long *mask, int boxes_num, float nms_overlap_thresh); int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_overlap) { // params boxes_a: (N, 5) [x1, y1, x2, y2, ry] // params boxes_b: (M, 5) // params ans_overlap: (N, M) CHECK_INPUT(boxes_a); CHECK_INPUT(boxes_b); CHECK_INPUT(ans_overlap); int num_a = boxes_a.size(0); int num_b = boxes_b.size(0); const float *boxes_a_data = boxes_a.data_ptr(); const float *boxes_b_data = boxes_b.data_ptr(); float *ans_overlap_data = ans_overlap.data_ptr(); boxesoverlapLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_overlap_data); return 1; } int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_iou) { // params boxes_a: (N, 5) [x1, y1, x2, y2, ry] // params boxes_b: (M, 5) // params ans_overlap: (N, M) CHECK_INPUT(boxes_a); CHECK_INPUT(boxes_b); CHECK_INPUT(ans_iou); int num_a = 
boxes_a.size(0); int num_b = boxes_b.size(0); const float *boxes_a_data = boxes_a.data_ptr(); const float *boxes_b_data = boxes_b.data_ptr(); float *ans_iou_data = ans_iou.data_ptr(); boxesioubevLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_iou_data); return 1; } int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh, int device_id) { // params boxes: (N, 5) [x1, y1, x2, y2, ry] // params keep: (N) CHECK_INPUT(boxes); CHECK_CONTIGUOUS(keep); cudaSetDevice(device_id); int boxes_num = boxes.size(0); const float *boxes_data = boxes.data_ptr(); long *keep_data = keep.data_ptr(); const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); unsigned long long *mask_data = NULL; CHECK_ERROR(cudaMalloc((void **)&mask_data, boxes_num * col_blocks * sizeof(unsigned long long))); nmsLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh); // unsigned long long mask_cpu[boxes_num * col_blocks]; // unsigned long long *mask_cpu = new unsigned long long [boxes_num * // col_blocks]; std::vector mask_cpu(boxes_num * col_blocks); // printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks); CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data, boxes_num * col_blocks * sizeof(unsigned long long), cudaMemcpyDeviceToHost)); cudaFree(mask_data); unsigned long long remv_cpu[col_blocks]; memset(remv_cpu, 0, col_blocks * sizeof(unsigned long long)); int num_to_keep = 0; for (int i = 0; i < boxes_num; i++) { int nblock = i / THREADS_PER_BLOCK_NMS; int inblock = i % THREADS_PER_BLOCK_NMS; if (!(remv_cpu[nblock] & (1ULL << inblock))) { keep_data[num_to_keep++] = i; unsigned long long *p = &mask_cpu[0] + i * col_blocks; for (int j = nblock; j < col_blocks; j++) { remv_cpu[j] |= p[j]; } } } if (cudaSuccess != cudaGetLastError()) printf("Error!\n"); return num_to_keep; } int nms_normal_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh, int device_id) { // params boxes: (N, 5) [x1, y1, x2, y2, ry] // params keep: (N) CHECK_INPUT(boxes); 
CHECK_CONTIGUOUS(keep); cudaSetDevice(device_id); int boxes_num = boxes.size(0); const float *boxes_data = boxes.data_ptr(); long *keep_data = keep.data_ptr(); const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); unsigned long long *mask_data = NULL; CHECK_ERROR(cudaMalloc((void **)&mask_data, boxes_num * col_blocks * sizeof(unsigned long long))); nmsNormalLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh); // unsigned long long mask_cpu[boxes_num * col_blocks]; // unsigned long long *mask_cpu = new unsigned long long [boxes_num * // col_blocks]; std::vector mask_cpu(boxes_num * col_blocks); // printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks); CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data, boxes_num * col_blocks * sizeof(unsigned long long), cudaMemcpyDeviceToHost)); cudaFree(mask_data); unsigned long long remv_cpu[col_blocks]; memset(remv_cpu, 0, col_blocks * sizeof(unsigned long long)); int num_to_keep = 0; for (int i = 0; i < boxes_num; i++) { int nblock = i / THREADS_PER_BLOCK_NMS; int inblock = i % THREADS_PER_BLOCK_NMS; if (!(remv_cpu[nblock] & (1ULL << inblock))) { keep_data[num_to_keep++] = i; unsigned long long *p = &mask_cpu[0] + i * col_blocks; for (int j = nblock; j < col_blocks; j++) { remv_cpu[j] |= p[j]; } } } if (cudaSuccess != cudaGetLastError()) printf("Error!\n"); return num_to_keep; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("boxes_overlap_bev_gpu", &boxes_overlap_bev_gpu, "oriented boxes overlap"); m.def("boxes_iou_bev_gpu", &boxes_iou_bev_gpu, "oriented boxes iou"); m.def("nms_gpu", &nms_gpu, "oriented nms gpu"); m.def("nms_normal_gpu", &nms_normal_gpu, "nms gpu"); } ================================================ FILE: mmdet3d/ops/iou3d/src/iou3d_kernel.cu ================================================ // Modified from // https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms_kernel.cu /* 3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others) 
// Minimal 2D point / vector in single precision, usable from device code.
struct Point {
  float x, y;

  // Leaves x and y uninitialised; callers assign them afterwards (e.g. set()).
  __device__ Point() {}

  // Accepts doubles and narrows them to the float members.
  __device__ Point(double _x, double _y) : x(_x), y(_y) {}

  // Overwrite both coordinates.
  __device__ void set(float _x, float _y) {
    x = _x;
    y = _y;
  }

  // Component-wise sum.
  __device__ Point operator+(const Point &b) const {
    Point sum;
    sum.set(x + b.x, y + b.y);
    return sum;
  }

  // Component-wise difference.
  __device__ Point operator-(const Point &b) const {
    Point diff;
    diff.set(x - b.x, y - b.y);
    return diff;
  }
};
// Intersect segment p0-p1 with segment q0-q1.
//
// Returns 1 and writes the crossing point to `ans` when the two segments
// strictly cross each other; returns 0 (leaving `ans` untouched) otherwise.
__device__ inline int intersection(const Point &p1, const Point &p0,
                                   const Point &q1, const Point &q0,
                                   Point &ans) {
  // fast exclusion: the axis-aligned bounding boxes of the segments must meet
  if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;

  // check cross standing
  // s1 * s2 > 0 means q0 and q1 lie strictly on opposite sides of line p0-p1;
  // s3 * s4 > 0 means p0 and p1 lie strictly on opposite sides of line q0-q1.
  // Touching or collinear configurations give a zero product and are rejected.
  float s1 = cross(q0, p1, p0);
  float s2 = cross(p1, q1, p0);
  float s3 = cross(p0, q1, q0);
  float s4 = cross(q1, p1, q0);

  if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0;

  // calculate intersection of two lines
  float s5 = cross(q1, p1, p0);
  if (fabs(s5 - s1) > EPS) {
    // Interpolate between q0 and q1 using the two cross products as weights.
    ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
    ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);

  } else {
    // Fallback: solve the two line equations a*x + b*y + c = 0 directly.
    // NOTE(review): D is algebraically -(s5 - s1), so in this branch it is
    // itself at most about EPS in magnitude and the divisions below are
    // unguarded -- confirm this cannot yield inf/NaN for the inputs used.
    float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
    float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
    float D = a0 * b1 - a1 * b0;

    ans.x = (b0 * c1 - b1 * c0) / D;
    ans.y = (a1 * c0 - a0 * c1) / D;
  }

  return 1;
}
%.3f)\n", center_a.x, center_a.y, center_b.x, center_b.y); #endif Point box_a_corners[5]; box_a_corners[0].set(a_x1, a_y1); box_a_corners[1].set(a_x2, a_y1); box_a_corners[2].set(a_x2, a_y2); box_a_corners[3].set(a_x1, a_y2); Point box_b_corners[5]; box_b_corners[0].set(b_x1, b_y1); box_b_corners[1].set(b_x2, b_y1); box_b_corners[2].set(b_x2, b_y2); box_b_corners[3].set(b_x1, b_y2); // get oriented corners float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle); float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle); for (int k = 0; k < 4; k++) { #ifdef DEBUG printf("before corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y); #endif rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]); rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]); #ifdef DEBUG printf("corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y); #endif } box_a_corners[4] = box_a_corners[0]; box_b_corners[4] = box_b_corners[0]; // get intersection of lines Point cross_points[16]; Point poly_center; int cnt = 0, flag = 0; poly_center.set(0, 0); for (int i = 0; i < 4; i++) { for (int j = 0; j < 4; j++) { flag = intersection(box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], cross_points[cnt]); if (flag) { poly_center = poly_center + cross_points[cnt]; cnt++; } } } // check corners for (int k = 0; k < 4; k++) { if (check_in_box2d(box_a, box_b_corners[k])) { poly_center = poly_center + box_b_corners[k]; cross_points[cnt] = box_b_corners[k]; cnt++; } if (check_in_box2d(box_b, box_a_corners[k])) { poly_center = poly_center + box_a_corners[k]; cross_points[cnt] = box_a_corners[k]; cnt++; } } poly_center.x /= cnt; poly_center.y /= cnt; // sort the points of polygon Point temp; for (int j = 0; j < cnt - 1; j++) { for (int i = 0; i < cnt - j - 1; i++) { if 
// Bird's-eye-view IoU of two rotated boxes.
//
// box_a, box_b: 5 floats each, [x1, y1, x2, y2, angle].
// The area of each box is taken from its (x1, y1)-(x2, y2) extents, the
// intersection area from box_overlap(); the union is clamped to at least EPS
// so the division never sees a zero denominator.
__device__ inline float iou_bev(const float *box_a, const float *box_b) {
  const float area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]);
  const float area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]);
  const float inter = box_overlap(box_a, box_b);
  const float uni = fmaxf(area_a + area_b - inter, EPS);
  return inter / uni;
}
/**
 * Build the pairwise suppression bitmask for NMS based on iou_normal().
 *
 * As read from the indexing below: blockIdx.y selects a group of
 * THREADS_PER_BLOCK_NMS "row" boxes, blockIdx.x a group of
 * THREADS_PER_BLOCK_NMS "column" boxes, and each thread owns one row box.
 *
 * @param boxes_num           number of boxes N
 * @param nms_overlap_thresh  IoU above which a column box is flagged
 * @param boxes               (N, 5) boxes [x1, y1, x2, y2, ry]
 * @param mask                (N, DIVUP(N, THREADS_PER_BLOCK_NMS)) output
 *                            words; bit i of mask[row][col_group] is set when
 *                            the IoU between the row box and column box
 *                            (col_group * THREADS_PER_BLOCK_NMS + i) exceeds
 *                            the threshold
 */
__global__ void nms_normal_kernel(const int boxes_num,
                                  const float nms_overlap_thresh,
                                  const float *boxes,
                                  unsigned long long *mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;
  const unsigned int lane = threadIdx.x;

  // if (row_start > col_start) return;

  // The last group in each direction may be only partially filled.
  const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
                             THREADS_PER_BLOCK_NMS);
  const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
                             THREADS_PER_BLOCK_NMS);

  // Stage the column boxes in shared memory, one box (5 floats) per thread.
  __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];

  if (lane < col_size) {
    const float *src = boxes + (THREADS_PER_BLOCK_NMS * col_start + lane) * 5;
    float *dst = block_boxes + lane * 5;
    for (int c = 0; c < 5; ++c) {
      dst[c] = src[c];
    }
  }
  __syncthreads();

  if (lane < row_size) {
    const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + lane;
    const float *cur_box = boxes + cur_box_idx * 5;

    // Inside the diagonal group only compare against later boxes, so a box
    // never flags itself or an earlier box of its own group.
    const int first = (row_start == col_start) ? lane + 1 : 0;

    unsigned long long bits = 0;
    for (int i = first; i < col_size; i++) {
      if (iou_normal(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        bits |= 1ULL << i;
      }
    }

    const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
    mask[cur_box_idx * col_blocks + col_start] = bits;
  }
}
class KNN(Function):
    """KNN (CUDA).

    Find k-nearest points.
    """

    @staticmethod
    def forward(ctx,
                k: int,
                xyz: torch.Tensor,
                center_xyz: torch.Tensor,
                transposed: bool = False) -> torch.Tensor:
        """forward.

        Args:
            k (int): number of nearest neighbors.
            xyz (Tensor): (B, N, 3) if transposed == False, else (B, 3, N).
                xyz coordinates of the features.
            center_xyz (Tensor): (B, npoint, 3) if transposed == False,
                else (B, 3, npoint). centers of the knn query.
            transposed (bool): whether the input tensors are transposed.
                defaults to False. Should not explicitly use this keyword
                when calling knn (=KNN.apply), just add the fourth param.

        Returns:
            Tensor: (B, k, npoint) tensor with the indices of
                the features that form k-nearest neighbours.
        """
        assert k > 0

        # The extension is called with coordinate-major (B, 3, N) tensors.
        if not transposed:
            xyz = xyz.transpose(2, 1).contiguous()
            center_xyz = center_xyz.transpose(2, 1).contiguous()

        B, _, npoint = center_xyz.shape
        N = xyz.shape[2]

        assert center_xyz.is_contiguous()
        assert xyz.is_contiguous()

        center_xyz_device = center_xyz.get_device()
        assert center_xyz_device == xyz.get_device(), \
            'center_xyz and xyz should be put on the same device'
        if torch.cuda.current_device() != center_xyz_device:
            torch.cuda.set_device(center_xyz_device)

        idx = center_xyz.new_zeros((B, k, npoint)).long()

        # The wrapper handles one sample at a time and writes into idx[bi].
        for bi in range(B):
            knn_ext.knn_wrapper(xyz[bi], N, center_xyz[bi], npoint, idx[bi], k)

        ctx.mark_non_differentiable(idx)

        # Shift the indices written by the extension down by one before
        # returning them.
        idx -= 1

        return idx

    @staticmethod
    def backward(ctx, a=None):
        """No gradient flows through the index output.

        One ``None`` is returned per ``forward`` input
        (k, xyz, center_xyz, transposed), as autograd requires.
        """
        return None, None, None, None
at::empty({ref_nb, query_nb}, query.options().dtype(at::kFloat)); float * dist_dev = dist.data_ptr(); long * ind_dev = ind.data_ptr(); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); knn_kernels_launcher( ref_dev, ref_nb, query_dev, query_nb, dim, k, dist_dev, ind_dev, stream ); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("knn_wrapper", &knn_wrapper, "knn_wrapper"); } ================================================ FILE: mmdet3d/ops/knn/src/knn_cuda.cu ================================================ /** Modified from https://github.com/unlimblue/KNN_CUDA * which is the modified version of knn-CUDA * from https://github.com/vincentfpgarcia/kNN-CUDA * Last modified by Christopher B. Choy 12/23/2016 * vincentfpgarcia wrote the original cuda code, Christopher modified it and * set it up for pytorch 0.4, and unlimblue updated it to pytorch >= 1.0 */ // Includes #include #include "cuda.h" // Constants used by the program #define BLOCK_DIM 16 #define DEBUG 0 /** * Computes the distance between two matrix A (reference points) and * B (query points) containing respectively wA and wB points. 
/**
  * Computes the squared Euclidean distance between every point of A
  * (reference points) and every point of B (query points), containing
  * respectively wA and wB points. No square root is taken here.
  *
  * Layout as implied by the indexing below: A is a dim x wA matrix and B a
  * dim x wB matrix, both stored row by row (one row per coordinate); AB is a
  * wA x wB matrix whose entry (i, j) is the squared distance between point i
  * of A and point j of B. The shared tiles are BLOCK_DIM x BLOCK_DIM and are
  * indexed with threadIdx.x / threadIdx.y.
  *
  * @param A     pointer on the matrix A
  * @param wA    width of the matrix A = number of points in A
  * @param B     pointer on the matrix B
  * @param wB    width of the matrix B = number of points in B
  * @param dim   dimension of points = height of matrices A and B
  * @param AB    pointer on the matrix containing the wA*wB distances computed
  */
__global__ void cuComputeDistanceGlobal(const float* A, int wA,
    const float* B, int wB, int dim, float* AB){

  // Declaration of the shared memory arrays As and Bs used to store the sub-matrix of A and B
  __shared__ float shared_A[BLOCK_DIM][BLOCK_DIM];
  __shared__ float shared_B[BLOCK_DIM][BLOCK_DIM];

  // Sub-matrix of A (begin, step, end) and Sub-matrix of B (begin, step)
  // NOTE(review): these shared scalars are written by every thread of the
  // block (all with the same value) and read back without a barrier in
  // between -- confirm this is intended.
  __shared__ int begin_A;
  __shared__ int begin_B;
  __shared__ int step_A;
  __shared__ int step_B;
  __shared__ int end_A;

  // Thread index
  int tx = threadIdx.x;
  int ty = threadIdx.y;

  // Other variables
  float tmp;
  float ssd = 0;  // running sum of squared differences for this thread's pair

  // Loop parameters
  // begin_* : first column (point index) handled by this block
  // step_*  : offset that advances BLOCK_DIM rows (coordinates) in the matrix
  // end_A   : offset of the block's first column in the last row of A
  begin_A = BLOCK_DIM * blockIdx.y;
  begin_B = BLOCK_DIM * blockIdx.x;
  step_A  = BLOCK_DIM * wA;
  step_B  = BLOCK_DIM * wB;
  end_A   = begin_A + (dim-1) * wA;

  // Conditions
  int cond0 = (begin_A + tx < wA); // used to write in shared memory
  int cond1 = (begin_B + tx < wB); // used to write in shared memory & to computations and to write in output matrix
  int cond2 = (begin_A + ty < wA); // used to computations and to write in output matrix

  // Loop over all the sub-matrices of A and B required to compute the block sub-matrix
  for (int a = begin_A, b = begin_B; a <= end_A; a += step_A, b += step_B) {
    // Load the matrices from device memory to shared memory; each thread loads one element of each matrix
    // Rows past `dim` and columns past wA / wB are zero-filled, so they add
    // nothing to the sum below.
    if (a/wA + ty < dim){
      shared_A[ty][tx] = (cond0)? A[a + wA * ty + tx] : 0;
      shared_B[ty][tx] = (cond1)? B[b + wB * ty + tx] : 0;
    }
    else{
      shared_A[ty][tx] = 0;
      shared_B[ty][tx] = 0;
    }

    // Synchronize to make sure the matrices are loaded
    __syncthreads();

    // Compute the difference between the two matrixes; each thread computes one element of the block sub-matrix
    // Here ty selects the point of A and tx the point of B; k walks the
    // coordinates held in the current tile.
    if (cond2 && cond1){
      for (int k = 0; k < BLOCK_DIM; ++k){
        tmp = shared_A[k][ty] - shared_B[k][tx];
        ssd += tmp*tmp;
      }
    }

    // Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
    __syncthreads();
  }

  // Write the block sub-matrix to device memory; each thread writes one element
  if (cond2 && cond1)
    AB[(begin_A + ty) * wB + begin_B + tx] = ssd;
}
class AllReduce(Function):
    """Differentiable sum of a tensor across all distributed workers."""

    @staticmethod
    def forward(ctx, input):
        input_list = [
            torch.zeros_like(input) for k in range(dist.get_world_size())
        ]
        # Use all_gather instead of all_reduce, because in-place operations
        # are unreliable here.
        dist.all_gather(input_list, input, async_op=False)
        inputs = torch.stack(input_list, dim=0)
        return torch.sum(inputs, dim=0)

    @staticmethod
    def backward(ctx, grad_output):
        # The gradient of a cross-worker sum is the cross-worker sum of the
        # incoming gradients.
        dist.all_reduce(grad_output, async_op=False)
        return grad_output


@NORM_LAYERS.register_module('naiveSyncBN1d')
class NaiveSyncBatchNorm1d(nn.BatchNorm1d):
    """Synchronized Batch Normalization for 2D/3D Tensors.

    Note:
        This implementation is modified from
        https://github.com/facebookresearch/detectron2/

        `torch.nn.SyncBatchNorm` has known issues.
        It produces significantly worse AP (and sometimes goes NaN)
        when the batch size on each worker is quite different
        (e.g., when scale augmentation is used).
        In 3D detection, different workers have points of different shapes,
        which also causes instability.

        Use this implementation before `nn.SyncBatchNorm` is fixed.
        It is slower than `nn.SyncBatchNorm`.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.fp16_enabled = False

    # customized normalization layer still needs this decorator
    # to force the input to be fp32 and the output to be fp16
    # TODO: make mmcv fp16 utils handle customized norm layers
    @force_fp32(out_fp16=True)
    def forward(self, input):
        """Normalize ``input`` with statistics averaged over all workers.

        Args:
            input (Tensor): float32 tensor of shape (N, C) or (N, C, L).

        Returns:
            Tensor: normalized tensor with the same shape as ``input``.
        """
        assert input.dtype == torch.float32, \
            f'input should be in float32 type, got {input.dtype}'
        # Without an initialised process group there is nothing to
        # synchronise (and dist.get_world_size() would raise), so fall back
        # to plain batch norm.
        using_dist = dist.is_available() and dist.is_initialized()
        if (not using_dist) or dist.get_world_size() == 1 \
                or not self.training:
            return super().forward(input)
        assert input.shape[0] > 0, 'SyncBN does not support empty inputs'
        # Treat (N, C) input as (N, C, 1) so the reductions below apply.
        is_two_dim = input.dim() == 2
        if is_two_dim:
            input = input.unsqueeze(2)

        C = input.shape[1]
        mean = torch.mean(input, dim=[0, 2])
        meansqr = torch.mean(input * input, dim=[0, 2])

        # Average E[x] and E[x^2] over workers with a single collective.
        vec = torch.cat([mean, meansqr], dim=0)
        vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size())

        mean, meansqr = torch.split(vec, C)
        var = meansqr - mean * mean
        self.running_mean += self.momentum * (
            mean.detach() - self.running_mean)
        self.running_var += self.momentum * (var.detach() - self.running_var)

        invstd = torch.rsqrt(var + self.eps)
        scale = self.weight * invstd
        bias = self.bias - mean * scale
        scale = scale.reshape(1, -1, 1)
        bias = bias.reshape(1, -1, 1)
        output = input * scale + bias
        if is_two_dim:
            output = output.squeeze(2)
        return output


@NORM_LAYERS.register_module('naiveSyncBN2d')
class NaiveSyncBatchNorm2d(nn.BatchNorm2d):
    """Synchronized Batch Normalization for 4D Tensors.

    Note:
        This implementation is modified from
        https://github.com/facebookresearch/detectron2/

        `torch.nn.SyncBatchNorm` has known issues.
        It produces significantly worse AP (and sometimes goes NaN)
        when the batch size on each worker is quite different
        (e.g., when scale augmentation is used).
        This phenomenon also occurs when the multi-modality feature fusion
        modules of multi-modality detectors use SyncBN.

        Use this implementation before `nn.SyncBatchNorm` is fixed.
        It is slower than `nn.SyncBatchNorm`.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.fp16_enabled = False

    # customized normalization layer still needs this decorator
    # to force the input to be fp32 and the output to be fp16
    # TODO: make mmcv fp16 utils handle customized norm layers
    @force_fp32(out_fp16=True)
    def forward(self, input):
        """Normalize ``input`` with statistics averaged over all workers.

        Args:
            input (Tensor): float32 tensor of shape (N, C, H, W).

        Returns:
            Tensor: normalized tensor with the same shape as ``input``.
        """
        assert input.dtype == torch.float32, \
            f'input should be in float32 type, got {input.dtype}'
        # Without an initialised process group there is nothing to
        # synchronise (and dist.get_world_size() would raise), so fall back
        # to plain batch norm.
        using_dist = dist.is_available() and dist.is_initialized()
        if (not using_dist) or dist.get_world_size() == 1 \
                or not self.training:
            return super().forward(input)

        assert input.shape[0] > 0, 'SyncBN does not support empty inputs'
        C = input.shape[1]
        mean = torch.mean(input, dim=[0, 2, 3])
        meansqr = torch.mean(input * input, dim=[0, 2, 3])

        # Average E[x] and E[x^2] over workers with a single collective.
        vec = torch.cat([mean, meansqr], dim=0)
        vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size())

        mean, meansqr = torch.split(vec, C)
        var = meansqr - mean * mean
        self.running_mean += self.momentum * (
            mean.detach() - self.running_mean)
        self.running_var += self.momentum * (var.detach() - self.running_var)

        invstd = torch.rsqrt(var + self.eps)
        scale = self.weight * invstd
        bias = self.bias - mean * scale
        scale = scale.reshape(1, -1, 1, 1)
        bias = bias.reshape(1, -1, 1, 1)
        return input * scale + bias
class PointFPModule(nn.Module):
    """Feature propagation (FP) module used in PointNets.

    Carries features from a source point set over to a target point set and
    refines them with a stack of 1x1 convolution layers.

    Args:
        mlp_channels (list[int]): Channel sizes of the MLP; consecutive
            entries form the input/output channels of each layer.
        norm_cfg (dict): Type of normalization method.
            Default: dict(type='BN2d').
    """

    def __init__(self,
                 mlp_channels: List[int],
                 norm_cfg: dict = dict(type='BN2d')):
        super().__init__()
        self.fp16_enabled = False
        self.mlps = nn.Sequential()
        # One ConvModule per consecutive (in, out) channel pair.
        channel_pairs = zip(mlp_channels[:-1], mlp_channels[1:])
        for layer_id, (in_ch, out_ch) in enumerate(channel_pairs):
            layer = ConvModule(
                in_ch,
                out_ch,
                kernel_size=(1, 1),
                stride=(1, 1),
                conv_cfg=dict(type='Conv2d'),
                norm_cfg=norm_cfg)
            self.mlps.add_module(f'layer{layer_id}', layer)

    @force_fp32()
    def forward(self, target: torch.Tensor, source: torch.Tensor,
                target_feats: torch.Tensor,
                source_feats: torch.Tensor) -> torch.Tensor:
        """forward.

        Args:
            target (Tensor): (B, n, 3) tensor of the xyz positions of
                the target features.
            source (Tensor): (B, m, 3) tensor of the xyz positions of
                the source features.
            target_feats (Tensor): (B, C1, n) tensor of the features to be
                propagated to.
            source_feats (Tensor): (B, C2, m) tensor of features
                to be propagated.

        Return:
            Tensor: (B, M, N) M = mlp[-1], tensor of the target features.
        """
        if source is None:
            # No source positions: broadcast the source features to every
            # target point.
            batch_and_channels = source_feats.size()[0:2]
            propagated = source_feats.expand(*batch_and_channels,
                                             target.size(1))
        else:
            # Inverse-distance weighting over the neighbours from three_nn.
            dist, idx = three_nn(target, source)
            inv_dist = 1.0 / (dist + 1e-8)
            weight = inv_dist / torch.sum(inv_dist, dim=2, keepdim=True)
            propagated = three_interpolate(source_feats, idx, weight)

        fused = propagated
        if target_feats is not None:
            # (B, C2 + C1, n)
            fused = torch.cat([propagated, target_feats], dim=1)

        # The MLP is built from 2D convolutions, so add a trailing unit axis
        # for the call and drop it again afterwards.
        out = self.mlps(fused.unsqueeze(-1))
        return out.squeeze(-1)
Default: [-1]. dilated_group (bool): Whether to use dilated ball query. Default: False. norm_cfg (dict): Type of normalization method. Default: dict(type='BN2d'). use_xyz (bool): Whether to use xyz. Default: True. pool_mod (str): Type of pooling method. Default: 'max_pool'. normalize_xyz (bool): Whether to normalize local XYZ with radius. Default: False. bias (bool | str): If specified as `auto`, it will be decided by the norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise False. Default: "auto". """ def __init__(self, num_point: int, radii: List[float], sample_nums: List[int], mlp_channels: List[List[int]], fps_mod: List[str] = ['D-FPS'], fps_sample_range_list: List[int] = [-1], dilated_group: bool = False, norm_cfg: dict = dict(type='BN2d'), use_xyz: bool = True, pool_mod='max', normalize_xyz: bool = False, bias='auto'): super().__init__() assert len(radii) == len(sample_nums) == len(mlp_channels) assert pool_mod in ['max', 'avg'] assert isinstance(fps_mod, list) or isinstance(fps_mod, tuple) assert isinstance(fps_sample_range_list, list) or isinstance( fps_sample_range_list, tuple) assert len(fps_mod) == len(fps_sample_range_list) if isinstance(mlp_channels, tuple): mlp_channels = list(map(list, mlp_channels)) if isinstance(num_point, int): self.num_point = [num_point] elif isinstance(num_point, list) or isinstance(num_point, tuple): self.num_point = num_point else: raise NotImplementedError('Error type of num_point!') self.pool_mod = pool_mod self.groupers = nn.ModuleList() self.mlps = nn.ModuleList() self.fps_mod_list = fps_mod self.fps_sample_range_list = fps_sample_range_list self.points_sampler = Points_Sampler(self.num_point, self.fps_mod_list, self.fps_sample_range_list) for i in range(len(radii)): radius = radii[i] sample_num = sample_nums[i] if num_point is not None: if dilated_group and i != 0: min_radius = radii[i - 1] else: min_radius = 0 grouper = QueryAndGroup( radius, sample_num, min_radius=min_radius, use_xyz=use_xyz, 
normalize_xyz=normalize_xyz) else: grouper = GroupAll(use_xyz) self.groupers.append(grouper) mlp_spec = mlp_channels[i] if use_xyz: mlp_spec[0] += 3 mlp = nn.Sequential() for i in range(len(mlp_spec) - 1): mlp.add_module( f'layer{i}', ConvModule( mlp_spec[i], mlp_spec[i + 1], kernel_size=(1, 1), stride=(1, 1), conv_cfg=dict(type='Conv2d'), norm_cfg=norm_cfg, bias=bias)) self.mlps.append(mlp) def forward( self, points_xyz: torch.Tensor, features: torch.Tensor = None, indices: torch.Tensor = None, target_xyz: torch.Tensor = None, ) -> (torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor): """forward. Args: points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. features (Tensor): (B, C, N) features of each point. Default: None. indices (Tensor): (B, num_point) Index of the features. Default: None. target_xyz (Tensor): (B, M, 3) new_xyz coordinates of the outputs. Returns: Tensor: (B, M, 3) where M is the number of points. New features xyz. Tensor: (B, M, sum_k(mlps[k][-1])) where M is the number of points. New feature descriptors. Tensor: (B, M) where M is the number of points. Index of the features. 
""" new_features_list = [] xyz_flipped = points_xyz.transpose(1, 2).contiguous() if indices is not None: assert (indices.shape[1] == self.num_point[0]) new_xyz = gather_points(xyz_flipped, indices).transpose( 1, 2).contiguous() if self.num_point is not None else None elif target_xyz is not None: new_xyz = target_xyz.contiguous() else: indices = self.points_sampler(points_xyz, features) new_xyz = gather_points(xyz_flipped, indices).transpose( 1, 2).contiguous() if self.num_point is not None else None for i in range(len(self.groupers)): # (B, C, num_point, nsample) new_features = self.groupers[i](points_xyz, new_xyz, features) # (B, mlp[-1], num_point, nsample) new_features = self.mlps[i](new_features) if self.pool_mod == 'max': # (B, mlp[-1], num_point, 1) new_features = F.max_pool2d( new_features, kernel_size=[1, new_features.size(3)]) elif self.pool_mod == 'avg': # (B, mlp[-1], num_point, 1) new_features = F.avg_pool2d( new_features, kernel_size=[1, new_features.size(3)]) else: raise NotImplementedError new_features = new_features.squeeze(-1) # (B, mlp[-1], num_point) new_features_list.append(new_features) return new_xyz, torch.cat(new_features_list, dim=1), indices @SA_MODULES.register_module() class PointSAModule(PointSAModuleMSG): """Point set abstraction module used in Pointnets. Args: mlp_channels (list[int]): Specify of the pointnet before the global pooling for each scale. num_point (int): Number of points. Default: None. radius (float): Radius to group with. Default: None. num_sample (int): Number of samples in each ball query. Default: None. norm_cfg (dict): Type of normalization method. Default: dict(type='BN2d'). use_xyz (bool): Whether to use xyz. Default: True. pool_mod (str): Type of pooling method. Default: 'max_pool'. fps_mod (list[str]: Type of FPS method, valid mod ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS']. fps_sample_range_list (list[int]): Range of points to apply FPS. Default: [-1]. 
normalize_xyz (bool): Whether to normalize local XYZ with radius. Default: False. """ def __init__(self, mlp_channels: List[int], num_point: int = None, radius: float = None, num_sample: int = None, norm_cfg: dict = dict(type='BN2d'), use_xyz: bool = True, pool_mod: str = 'max', fps_mod: List[str] = ['D-FPS'], fps_sample_range_list: List[int] = [-1], normalize_xyz: bool = False): super().__init__( mlp_channels=[mlp_channels], num_point=num_point, radii=[radius], sample_nums=[num_sample], norm_cfg=norm_cfg, use_xyz=use_xyz, pool_mod=pool_mod, fps_mod=fps_mod, fps_sample_range_list=fps_sample_range_list, normalize_xyz=normalize_xyz) ================================================ FILE: mmdet3d/ops/pointnet_modules/registry.py ================================================ from mmcv.utils import Registry SA_MODULES = Registry('point_sa_module') ================================================ FILE: mmdet3d/ops/roiaware_pool3d/__init__.py ================================================ from .points_in_boxes import (points_in_boxes_batch, points_in_boxes_cpu, points_in_boxes_gpu) from .roiaware_pool3d import RoIAwarePool3d __all__ = [ 'RoIAwarePool3d', 'points_in_boxes_gpu', 'points_in_boxes_cpu', 'points_in_boxes_batch' ] ================================================ FILE: mmdet3d/ops/roiaware_pool3d/points_in_boxes.py ================================================ import torch from . 
import roiaware_pool3d_ext def points_in_boxes_gpu(points, boxes): """Find points that are in boxes (CUDA) Args: points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR coordinate boxes (torch.Tensor): [B, T, 7], num_valid_boxes <= T, [x, y, z, w, l, h, ry] in LiDAR coordinate, (x, y, z) is the bottom center Returns: box_idxs_of_pts (torch.Tensor): (B, M), default background = -1 """ assert boxes.shape[0] == points.shape[0], \ f'Points and boxes should have the same batch size, ' \ f'got {boxes.shape[0]} and {boxes.shape[0]}' assert boxes.shape[2] == 7, \ f'boxes dimension should be 7, ' \ f'got unexpected shape {boxes.shape[2]}' assert points.shape[2] == 3, \ f'points dimension should be 3, ' \ f'got unexpected shape {points.shape[2]}' batch_size, num_points, _ = points.shape box_idxs_of_pts = points.new_zeros((batch_size, num_points), dtype=torch.int).fill_(-1) # If manually put the tensor 'points' or 'boxes' on a device # which is not the current device, some temporary variables # will be created on the current device in the cuda op, # and the output will be incorrect. # Therefore, we force the current device to be the same # as the device of the tensors if it was not. # Please refer to https://github.com/open-mmlab/mmdetection3d/issues/305 # for the incorrect output before the fix. points_device = points.get_device() assert points_device == boxes.get_device(), \ 'Points and boxes should be put on the same device' if torch.cuda.current_device() != points_device: torch.cuda.set_device(points_device) roiaware_pool3d_ext.points_in_boxes_gpu(boxes.contiguous(), points.contiguous(), box_idxs_of_pts) return box_idxs_of_pts def points_in_boxes_cpu(points, boxes): """Find points that are in boxes (CPU) Note: Currently, the output of this function is different from that of points_in_boxes_gpu. 
Args: points (torch.Tensor): [npoints, 3] boxes (torch.Tensor): [N, 7], in LiDAR coordinate, (x, y, z) is the bottom center Returns: point_indices (torch.Tensor): (N, npoints) """ # TODO: Refactor this function as a CPU version of points_in_boxes_gpu assert boxes.shape[1] == 7, \ f'boxes dimension should be 7, ' \ f'got unexpected shape {boxes.shape[2]}' assert points.shape[1] == 3, \ f'points dimension should be 3, ' \ f'got unexpected shape {points.shape[2]}' point_indices = points.new_zeros((boxes.shape[0], points.shape[0]), dtype=torch.int) roiaware_pool3d_ext.points_in_boxes_cpu(boxes.float().contiguous(), points.float().contiguous(), point_indices) return point_indices def points_in_boxes_batch(points, boxes): """Find points that are in boxes (CUDA) Args: points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR coordinate boxes (torch.Tensor): [B, T, 7], num_valid_boxes <= T, [x, y, z, w, l, h, ry] in LiDAR coordinate, (x, y, z) is the bottom center. Returns: box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0 """ assert boxes.shape[0] == points.shape[0], \ f'Points and boxes should have the same batch size, ' \ f'got {boxes.shape[0]} and {boxes.shape[0]}' assert boxes.shape[2] == 7, \ f'boxes dimension should be 7, ' \ f'got unexpected shape {boxes.shape[2]}' assert points.shape[2] == 3, \ f'points dimension should be 3, ' \ f'got unexpected shape {points.shape[2]}' batch_size, num_points, _ = points.shape num_boxes = boxes.shape[1] box_idxs_of_pts = points.new_zeros((batch_size, num_points, num_boxes), dtype=torch.int).fill_(0) # Same reason as line 25-32 points_device = points.get_device() assert points_device == boxes.get_device(), \ 'Points and boxes should be put on the same device' if torch.cuda.current_device() != points_device: torch.cuda.set_device(points_device) roiaware_pool3d_ext.points_in_boxes_batch(boxes.contiguous(), points.contiguous(), box_idxs_of_pts) return box_idxs_of_pts ================================================ FILE: 
mmdet3d/ops/roiaware_pool3d/roiaware_pool3d.py ================================================ import mmcv import torch from torch import nn as nn from torch.autograd import Function from . import roiaware_pool3d_ext class RoIAwarePool3d(nn.Module): def __init__(self, out_size, max_pts_per_voxel=128, mode='max'): super().__init__() """RoIAwarePool3d module Args: out_size (int or tuple): n or [n1, n2, n3] max_pts_per_voxel (int): m mode (str): 'max' or 'avg' """ self.out_size = out_size self.max_pts_per_voxel = max_pts_per_voxel assert mode in ['max', 'avg'] pool_method_map = {'max': 0, 'avg': 1} self.mode = pool_method_map[mode] def forward(self, rois, pts, pts_feature): """RoIAwarePool3d module forward. Args: rois (torch.Tensor): [N, 7],in LiDAR coordinate, (x, y, z) is the bottom center of rois pts (torch.Tensor): [npoints, 3] pts_feature (torch.Tensor): [npoints, C] Returns: pooled_features (torch.Tensor): [N, out_x, out_y, out_z, C] """ return RoIAwarePool3dFunction.apply(rois, pts, pts_feature, self.out_size, self.max_pts_per_voxel, self.mode) class RoIAwarePool3dFunction(Function): @staticmethod def forward(ctx, rois, pts, pts_feature, out_size, max_pts_per_voxel, mode): """RoIAwarePool3d function forward. 
Args: rois (torch.Tensor): [N, 7], in LiDAR coordinate, (x, y, z) is the bottom center of rois pts (torch.Tensor): [npoints, 3] pts_feature (torch.Tensor): [npoints, C] out_size (int or tuple): n or [n1, n2, n3] max_pts_per_voxel (int): m mode (int): 0 (max pool) or 1 (average pool) Returns: pooled_features (torch.Tensor): [N, out_x, out_y, out_z, C] """ if isinstance(out_size, int): out_x = out_y = out_z = out_size else: assert len(out_size) == 3 assert mmcv.is_tuple_of(out_size, int) out_x, out_y, out_z = out_size num_rois = rois.shape[0] num_channels = pts_feature.shape[-1] num_pts = pts.shape[0] pooled_features = pts_feature.new_zeros( (num_rois, out_x, out_y, out_z, num_channels)) argmax = pts_feature.new_zeros( (num_rois, out_x, out_y, out_z, num_channels), dtype=torch.int) pts_idx_of_voxels = pts_feature.new_zeros( (num_rois, out_x, out_y, out_z, max_pts_per_voxel), dtype=torch.int) roiaware_pool3d_ext.forward(rois, pts, pts_feature, argmax, pts_idx_of_voxels, pooled_features, mode) ctx.roiaware_pool3d_for_backward = (pts_idx_of_voxels, argmax, mode, num_pts, num_channels) return pooled_features @staticmethod def backward(ctx, grad_out): """RoIAwarePool3d function forward. Args: grad_out (torch.Tensor): [N, out_x, out_y, out_z, C] Returns: grad_in (torch.Tensor): [npoints, C] """ ret = ctx.roiaware_pool3d_for_backward pts_idx_of_voxels, argmax, mode, num_pts, num_channels = ret grad_in = grad_out.new_zeros((num_pts, num_channels)) roiaware_pool3d_ext.backward(pts_idx_of_voxels, argmax, grad_out.contiguous(), grad_in, mode) return None, None, grad_in, None, None, None if __name__ == '__main__': pass ================================================ FILE: mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cpu.cpp ================================================ // Modified from // https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu // Written by Shaoshuai Shi // All Rights Reserved 2019. 
#include #include #include #include #include #define CHECK_CONTIGUOUS(x) \ TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") // #define DEBUG inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz, float &local_x, float &local_y) { // should rotate pi/2 + alpha to translate LiDAR to local float rot_angle = rz + M_PI / 2; float cosa = cos(rot_angle), sina = sin(rot_angle); local_x = shift_x * cosa + shift_y * (-sina); local_y = shift_x * sina + shift_y * cosa; } inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d, float &local_x, float &local_y) { // param pt: (x, y, z) // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the // bottom center float x = pt[0], y = pt[1], z = pt[2]; float cx = box3d[0], cy = box3d[1], cz = box3d[2]; float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6]; cz += h / 2.0; // shift to the center since cz in box3d is the bottom center if (fabsf(z - cz) > h / 2.0) return 0; lidar_to_local_coords_cpu(x - cx, y - cy, rz, local_x, local_y); float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) & (local_y > -w / 2.0) & (local_y < w / 2.0); return in_flag; } int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, at::Tensor pts_indices_tensor) { // params boxes: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is the // bottom center, each box DO NOT overlaps params pts: (npoints, 3) [x, y, z] // in LiDAR coordinate params pts_indices: (N, npoints) CHECK_CONTIGUOUS(boxes_tensor); CHECK_CONTIGUOUS(pts_tensor); CHECK_CONTIGUOUS(pts_indices_tensor); int boxes_num = boxes_tensor.size(0); int pts_num = pts_tensor.size(0); const float *boxes = boxes_tensor.data_ptr(); const float *pts = pts_tensor.data_ptr(); int *pts_indices = pts_indices_tensor.data_ptr(); float local_x = 0, local_y = 0; for (int i = 0; i < boxes_num; i++) { for (int j = 0; j < pts_num; j++) { int cur_in_flag = check_pt_in_box3d_cpu(pts + j * 3, boxes + i * 7, local_x, local_y); 
pts_indices[i * pts_num + j] = cur_in_flag; } } return 1; } ================================================ FILE: mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu ================================================ // Modified from // https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu // Written by Shaoshuai Shi // All Rights Reserved 2019. #include #include #include #include #include #define THREADS_PER_BLOCK 256 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) #define CHECK_CUDA(x) \ TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") #define CHECK_CONTIGUOUS(x) \ TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") #define CHECK_INPUT(x) \ CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) // #define DEBUG __device__ inline void lidar_to_local_coords(float shift_x, float shift_y, float rz, float &local_x, float &local_y) { // should rotate pi/2 + alpha to translate LiDAR to local float rot_angle = rz + M_PI / 2; float cosa = cos(rot_angle), sina = sin(rot_angle); local_x = shift_x * cosa + shift_y * (-sina); local_y = shift_x * sina + shift_y * cosa; } __device__ inline int check_pt_in_box3d(const float *pt, const float *box3d, float &local_x, float &local_y) { // param pt: (x, y, z) // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the // bottom center float x = pt[0], y = pt[1], z = pt[2]; float cx = box3d[0], cy = box3d[1], cz = box3d[2]; float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6]; cz += h / 2.0; // shift to the center since cz in box3d is the bottom center if (fabsf(z - cz) > h / 2.0) return 0; lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) & (local_y > -w / 2.0) & (local_y < w / 2.0); return in_flag; } __global__ void points_in_boxes_kernel(int batch_size, int boxes_num, int pts_num, const float *boxes, const float *pts, int *box_idx_of_points) { // params boxes: (B, N, 7) [x, y, z, w, 
l, h, rz] in LiDAR coordinate, z is // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x, // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default // -1 int bs_idx = blockIdx.y; int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; if (bs_idx >= batch_size || pt_idx >= pts_num) return; boxes += bs_idx * boxes_num * 7; pts += bs_idx * pts_num * 3 + pt_idx * 3; box_idx_of_points += bs_idx * pts_num + pt_idx; float local_x = 0, local_y = 0; int cur_in_flag = 0; for (int k = 0; k < boxes_num; k++) { cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); if (cur_in_flag) { box_idx_of_points[0] = k; break; } } } __global__ void points_in_boxes_batch_kernel(int batch_size, int boxes_num, int pts_num, const float *boxes, const float *pts, int *box_idx_of_points) { // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x, // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default // -1 int bs_idx = blockIdx.y; int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; if (bs_idx >= batch_size || pt_idx >= pts_num) return; boxes += bs_idx * boxes_num * 7; pts += bs_idx * pts_num * 3 + pt_idx * 3; box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num; float local_x = 0, local_y = 0; int cur_in_flag = 0; for (int k = 0; k < boxes_num; k++) { cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); if (cur_in_flag) { box_idx_of_points[k] = 1; } cur_in_flag = 0; } } void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num, const float *boxes, const float *pts, int *box_idx_of_points) { // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x, // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default // -1 cudaError_t err; dim3 blocks(DIVUP(pts_num, 
THREADS_PER_BLOCK), batch_size); dim3 threads(THREADS_PER_BLOCK); points_in_boxes_kernel<<>>(batch_size, boxes_num, pts_num, boxes, pts, box_idx_of_points); err = cudaGetLastError(); if (cudaSuccess != err) { fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); exit(-1); } #ifdef DEBUG cudaDeviceSynchronize(); // for using printf in kernel function #endif } void points_in_boxes_batch_launcher(int batch_size, int boxes_num, int pts_num, const float *boxes, const float *pts, int *box_idx_of_points) { // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is // the bottom center, each box params pts: (B, npoints, 3) [x, y, z] in // LiDAR coordinate params boxes_idx_of_points: (B, npoints), default -1 cudaError_t err; dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size); dim3 threads(THREADS_PER_BLOCK); points_in_boxes_batch_kernel<<>>( batch_size, boxes_num, pts_num, boxes, pts, box_idx_of_points); err = cudaGetLastError(); if (cudaSuccess != err) { fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); exit(-1); } #ifdef DEBUG cudaDeviceSynchronize(); // for using printf in kernel function #endif } int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, at::Tensor box_idx_of_points_tensor) { // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x, // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default // -1 CHECK_INPUT(boxes_tensor); CHECK_INPUT(pts_tensor); CHECK_INPUT(box_idx_of_points_tensor); int batch_size = boxes_tensor.size(0); int boxes_num = boxes_tensor.size(1); int pts_num = pts_tensor.size(1); const float *boxes = boxes_tensor.data_ptr(); const float *pts = pts_tensor.data_ptr(); int *box_idx_of_points = box_idx_of_points_tensor.data_ptr(); points_in_boxes_launcher(batch_size, boxes_num, pts_num, boxes, pts, box_idx_of_points); return 1; } int 
points_in_boxes_batch(at::Tensor boxes_tensor, at::Tensor pts_tensor, at::Tensor box_idx_of_points_tensor) { // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is // the bottom center. params pts: (B, npoints, 3) [x, y, z] in LiDAR // coordinate params boxes_idx_of_points: (B, npoints), default -1 CHECK_INPUT(boxes_tensor); CHECK_INPUT(pts_tensor); CHECK_INPUT(box_idx_of_points_tensor); int batch_size = boxes_tensor.size(0); int boxes_num = boxes_tensor.size(1); int pts_num = pts_tensor.size(1); const float *boxes = boxes_tensor.data_ptr(); const float *pts = pts_tensor.data_ptr(); int *box_idx_of_points = box_idx_of_points_tensor.data_ptr(); points_in_boxes_batch_launcher(batch_size, boxes_num, pts_num, boxes, pts, box_idx_of_points); return 1; } ================================================ FILE: mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d.cpp ================================================ // Modified from // https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu // Written by Shaoshuai Shi // All Rights Reserved 2019. 
#include #include #include #define CHECK_CUDA(x) \ TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") #define CHECK_CONTIGUOUS(x) \ TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") #define CHECK_INPUT(x) \ CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x, int out_y, int out_z, const float *rois, const float *pts, const float *pts_feature, int *argmax, int *pts_idx_of_voxels, float *pooled_features, int pool_method); void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y, int out_z, int channels, int max_pts_each_voxel, const int *pts_idx_of_voxels, const int *argmax, const float *grad_out, float *grad_in, int pool_method); int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature, at::Tensor argmax, at::Tensor pts_idx_of_voxels, at::Tensor pooled_features, int pool_method); int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels, at::Tensor argmax, at::Tensor grad_out, at::Tensor grad_in, int pool_method); int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, at::Tensor pts_indices_tensor); int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, at::Tensor box_idx_of_points_tensor); int points_in_boxes_batch(at::Tensor boxes_tensor, at::Tensor pts_tensor, at::Tensor box_idx_of_points_tensor); int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature, at::Tensor argmax, at::Tensor pts_idx_of_voxels, at::Tensor pooled_features, int pool_method) { // params rois: (N, 7) [x, y, z, w, l, h, ry] in LiDAR coordinate // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate // params pts_feature: (npoints, C) // params argmax: (N, out_x, out_y, out_z, C) // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) // params pooled_features: (N, out_x, out_y, out_z, C) // params pool_method: 0: max_pool 1: avg_pool CHECK_INPUT(rois); 
CHECK_INPUT(pts); CHECK_INPUT(pts_feature); CHECK_INPUT(argmax); CHECK_INPUT(pts_idx_of_voxels); CHECK_INPUT(pooled_features); int boxes_num = rois.size(0); int pts_num = pts.size(0); int channels = pts_feature.size(1); int max_pts_each_voxel = pts_idx_of_voxels.size(4); // index 0 is the counter int out_x = pts_idx_of_voxels.size(1); int out_y = pts_idx_of_voxels.size(2); int out_z = pts_idx_of_voxels.size(3); assert((out_x < 256) && (out_y < 256) && (out_z < 256)); // we encode index with 8bit const float *rois_data = rois.data_ptr(); const float *pts_data = pts.data_ptr(); const float *pts_feature_data = pts_feature.data_ptr(); int *argmax_data = argmax.data_ptr(); int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr(); float *pooled_features_data = pooled_features.data_ptr(); roiaware_pool3d_launcher( boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z, rois_data, pts_data, pts_feature_data, argmax_data, pts_idx_of_voxels_data, pooled_features_data, pool_method); return 1; } int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels, at::Tensor argmax, at::Tensor grad_out, at::Tensor grad_in, int pool_method) { // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) // params argmax: (N, out_x, out_y, out_z, C) // params grad_out: (N, out_x, out_y, out_z, C) // params grad_in: (npoints, C), return value // params pool_method: 0: max_pool 1: avg_pool CHECK_INPUT(pts_idx_of_voxels); CHECK_INPUT(argmax); CHECK_INPUT(grad_out); CHECK_INPUT(grad_in); int boxes_num = pts_idx_of_voxels.size(0); int out_x = pts_idx_of_voxels.size(1); int out_y = pts_idx_of_voxels.size(2); int out_z = pts_idx_of_voxels.size(3); int max_pts_each_voxel = pts_idx_of_voxels.size(4); // index 0 is the counter int channels = grad_out.size(4); const int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr(); const int *argmax_data = argmax.data_ptr(); const float *grad_out_data = grad_out.data_ptr(); float *grad_in_data = grad_in.data_ptr(); 
roiaware_pool3d_backward_launcher(boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel, pts_idx_of_voxels_data, argmax_data, grad_out_data, grad_in_data, pool_method); return 1; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("forward", &roiaware_pool3d_gpu, "roiaware pool3d forward (CUDA)"); m.def("backward", &roiaware_pool3d_gpu_backward, "roiaware pool3d backward (CUDA)"); m.def("points_in_boxes_gpu", &points_in_boxes_gpu, "points_in_boxes_gpu forward (CUDA)"); m.def("points_in_boxes_batch", &points_in_boxes_batch, "points_in_boxes_batch forward (CUDA)"); m.def("points_in_boxes_cpu", &points_in_boxes_cpu, "points_in_boxes_cpu forward (CPU)"); } ================================================ FILE: mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu ================================================ // Modified from // https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu // Written by Shaoshuai Shi // All Rights Reserved 2019. 
#include #include #include #include #include #define THREADS_PER_BLOCK 256 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) // #define DEBUG __device__ inline void lidar_to_local_coords(float shift_x, float shift_y, float rz, float &local_x, float &local_y) { // should rotate pi/2 + alpha to translate LiDAR to local float rot_angle = rz + M_PI / 2; float cosa = cos(rot_angle), sina = sin(rot_angle); local_x = shift_x * cosa + shift_y * (-sina); local_y = shift_x * sina + shift_y * cosa; } __device__ inline int check_pt_in_box3d(const float *pt, const float *box3d, float &local_x, float &local_y) { // param pt: (x, y, z) // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the // bottom center float x = pt[0], y = pt[1], z = pt[2]; float cx = box3d[0], cy = box3d[1], cz = box3d[2]; float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6]; cz += h / 2.0; // shift to the center since cz in box3d is the bottom center if (fabsf(z - cz) > h / 2.0) return 0; lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) & (local_y > -w / 2.0) & (local_y < w / 2.0); return in_flag; } __global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num, int out_x, int out_y, int out_z, const float *rois, const float *pts, int *pts_mask) { // params rois: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate // params pts: (npoints, 3) [x, y, z] // params pts_mask: (N, npoints): -1 means point doesnot in this box, // otherwise: encode (x_idxs, y_idxs, z_idxs) by binary bit int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; int box_idx = blockIdx.y; if (pt_idx >= pts_num || box_idx >= boxes_num) return; pts += pt_idx * 3; rois += box_idx * 7; pts_mask += box_idx * pts_num + pt_idx; float local_x = 0, local_y = 0; int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y); pts_mask[0] = -1; if (cur_in_flag > 0) { float local_z = pts[2] - rois[2]; float w = rois[3], l = rois[4], h = rois[5]; 
// Max-pools point features into the voxel grid of every RoI.
//
// Thread mapping (as read from the index computation below): blockIdx.z
// selects the box, blockIdx.y the feature channel, and
// blockIdx.x * blockDim.x + threadIdx.x the flattened (x, y, z) voxel.
//
// params pts_feature: (npoints, C)
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel);
//   slot 0 of each voxel is the point counter, slots 1..counter hold point
//   indices
// params pooled_features: (N, out_x, out_y, out_z, C); written only when a
//   maximum was found, so voxels without a winner keep the caller's value
// params argmax: (N, out_x, out_y, out_z, C); index of the winning point, or
//   -1 when no point was selected
// pts_num is part of the signature but is not read by this kernel.
__global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels,
                                   int max_pts_each_voxel, int out_x, int out_y,
                                   int out_z, const float *pts_feature,
                                   const int *pts_idx_of_voxels,
                                   float *pooled_features, int *argmax) {
  int box_idx = blockIdx.z;
  int channel_idx = blockIdx.y;
  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;

  // Unflatten the voxel index into (x, y, z) grid coordinates.
  int x_idx = voxel_idx_flat / (out_y * out_z);
  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
  int z_idx = voxel_idx_flat % out_z;
  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
      y_idx >= out_y || z_idx >= out_z)
    return;

#ifdef DEBUG
  printf("src pts_idx_of_voxels: (%p, ), argmax: %p\n", pts_idx_of_voxels,
         argmax);
#endif

  // Advance every pointer to this thread's (box, voxel[, channel]) element.
  int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
  pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
                       offset_base * max_pts_each_voxel;
  pooled_features += box_idx * out_x * out_y * out_z * channels +
                     offset_base * channels + channel_idx;
  argmax += box_idx * out_x * out_y * out_z * channels +
            offset_base * channels + channel_idx;

  int argmax_idx = -1;
  // -INFINITY is the well-defined spelling of the sentinel; a double literal
  // such as -1e50 does not fit in a float.
  float max_val = -INFINITY;
  int total_pts = pts_idx_of_voxels[0];

  for (int k = 1; k <= total_pts; k++) {
    // Load the index and the feature once instead of re-reading global memory
    // for the comparison and again for the assignment.
    const int pt_idx = pts_idx_of_voxels[k];
    const float val = pts_feature[pt_idx * channels + channel_idx];
    if (val > max_val) {
      max_val = val;
      argmax_idx = pt_idx;
    }
  }

  if (argmax_idx != -1) {
    pooled_features[0] = max_val;
  }
  argmax[0] = argmax_idx;

#ifdef DEBUG
  printf(
      "channel_%d idx(%d, %d, %d), argmax_idx=(%d, %.3f), total=%d, after "
      "pts_idx: %p, argmax: (%p, %d)\n",
      channel_idx, x_idx, y_idx, z_idx, argmax_idx, max_val, total_pts,
      pts_idx_of_voxels, argmax, argmax_idx);
#endif
}
// Host-side driver for RoI-aware pooling. Runs three stages on the device:
//   1. generate_pts_mask_for_box3d  - per (box, point) voxel encoding or -1
//   2. collect_inside_pts_for_box3d - gather point indices per voxel
//   3. roiaware_maxpool3d / roiaware_avgpool3d - reduce features per voxel
//
// params rois: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate
// params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
// params pts_feature: (npoints, C)
// params argmax: (N, out_x, out_y, out_z, C)
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params pooled_features: (N, out_x, out_y, out_z, C)
// params pool_method: 0: max_pool 1: avg_pool (any other value runs the mask
//   and collect stages but no pooling kernel)
void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels,
                              int max_pts_each_voxel, int out_x, int out_y,
                              int out_z, const float *rois, const float *pts,
                              const float *pts_feature, int *argmax,
                              int *pts_idx_of_voxels, float *pooled_features,
                              int pool_method) {
  // Every grid below has boxes_num as one of its dimensions; a zero-sized
  // grid is an invalid launch configuration, and there is nothing to pool.
  if (boxes_num <= 0) return;

  dim3 threads(THREADS_PER_BLOCK);
  int *pts_mask = NULL;  // (N, M) scratch buffer; stays NULL without points

  if (pts_num > 0) {
    // Widen before multiplying so that N * M cannot overflow int.
    // NOTE(review): the kernels still index this buffer with int arithmetic.
    const size_t mask_bytes = static_cast<size_t>(boxes_num) *
                              static_cast<size_t>(pts_num) * sizeof(int);
    const cudaError_t alloc_status = cudaMalloc(&pts_mask, mask_bytes);
    if (alloc_status != cudaSuccess) {
      // Do not launch kernels on a null scratch buffer.
      fprintf(stderr,
              "roiaware_pool3d_launcher: cudaMalloc of %zu bytes failed: %s\n",
              mask_bytes, cudaGetErrorString(alloc_status));
      return;
    }
    cudaMemset(pts_mask, -1, mask_bytes);

    dim3 blocks_mask(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num);
    generate_pts_mask_for_box3d<<<blocks_mask, threads>>>(
        boxes_num, pts_num, out_x, out_y, out_z, rois, pts, pts_mask);
  }

  // TODO: Merge the collect and pool functions, SS
  // With pts_num == 0 the collect kernel never dereferences pts_mask.
  dim3 blocks_collect(DIVUP(boxes_num, THREADS_PER_BLOCK));
  collect_inside_pts_for_box3d<<<blocks_collect, threads>>>(
      boxes_num, pts_num, max_pts_each_voxel, out_x, out_y, out_z, pts_mask,
      pts_idx_of_voxels);

  // Skip pooling when the output grid or the channel axis is empty: those
  // would again be zero-sized grid dimensions.
  const int voxels_per_box = out_x * out_y * out_z;
  if (voxels_per_box > 0 && channels > 0) {
    dim3 blocks_pool(DIVUP(voxels_per_box, THREADS_PER_BLOCK), channels,
                     boxes_num);
    if (pool_method == 0) {
      roiaware_maxpool3d<<<blocks_pool, threads>>>(
          boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y,
          out_z, pts_feature, pts_idx_of_voxels, pooled_features, argmax);
    } else if (pool_method == 1) {
      roiaware_avgpool3d<<<blocks_pool, threads>>>(
          boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y,
          out_z, pts_feature, pts_idx_of_voxels, pooled_features);
    }
  }

  cudaFree(pts_mask);  // freeing NULL is a no-op

#ifdef DEBUG
  cudaDeviceSynchronize();  // for using printf in kernel function
#endif
}
// Backward pass of average pooling: spreads each voxel's output gradient
// evenly over the points that were collected into that voxel.
//
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel);
//   slot 0 is the point counter, slots 1..counter are point indices
// params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value (accumulated with atomicAdd)
__global__ void roiaware_avgpool3d_backward(int boxes_num, int channels,
                                            int out_x, int out_y, int out_z,
                                            int max_pts_each_voxel,
                                            const int *pts_idx_of_voxels,
                                            const float *grad_out,
                                            float *grad_in) {
  const int box = blockIdx.z;
  const int channel = blockIdx.y;
  const int flat_voxel = blockIdx.x * blockDim.x + threadIdx.x;

  // Unflatten the voxel index into grid coordinates.
  const int plane = out_y * out_z;
  const int vx = flat_voxel / plane;
  const int vy = (flat_voxel - vx * plane) / out_z;
  const int vz = flat_voxel % out_z;

  const bool in_range = box < boxes_num && channel < channels && vx < out_x &&
                        vy < out_y && vz < out_z;
  if (!in_range) return;

  const int voxel = vx * out_y * out_z + vy * out_z + vz;
  const int *voxel_pts =
      pts_idx_of_voxels + (box * out_x * out_y * out_z * max_pts_each_voxel +
                           voxel * max_pts_each_voxel);
  const float *voxel_grad =
      grad_out +
      (box * out_x * out_y * out_z * channels + voxel * channels + channel);

  const int count = voxel_pts[0];
  if (count <= 0) return;  // empty voxel: no point receives gradient

  // Every contributing point gets the same 1/count share of the gradient.
  const float share = voxel_grad[0] * (1 / fmaxf(float(count), 1.0));
  for (int slot = 1; slot <= count; ++slot) {
    atomicAdd(grad_in + voxel_pts[slot] * channels + channel, share);
  }
}
class SparseBottleneck(Bottleneck, spconv.SparseModule):
    """Sparse bottleneck block for PartA^2.

    Bottleneck block implemented with submanifold sparse convolution.
    The convolutions operate on the sparse tensor itself, while the
    normalization and activation layers are applied to its ``features``.

    Args:
        inplanes (int): inplanes of block.
        planes (int): planes of block.
        stride (int): stride of the first block. Default: 1
        downsample (None | Module): down sample module for block.
        conv_cfg (dict): dictionary to construct and config conv layer.
            Default: None
        norm_cfg (dict): dictionary to construct and config norm layer.
            Default: None. The value is forwarded unchanged to
            ``Bottleneck.__init__``.
    """

    expansion = 4

    def __init__(self,
                 inplanes,
                 planes,
                 stride=1,
                 downsample=None,
                 conv_cfg=None,
                 norm_cfg=None):

        spconv.SparseModule.__init__(self)
        Bottleneck.__init__(
            self,
            inplanes,
            planes,
            stride=stride,
            downsample=downsample,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg)

    def forward(self, x):
        """Run the residual bottleneck on a sparse tensor.

        Args:
            x: sparse tensor exposing a ``features`` attribute.

        Returns:
            The sparse tensor produced by the last convolution, with its
            features replaced by ``relu(norm3(conv3(...)) + identity)``.
        """
        identity = x.features

        # Features are swapped through replace_feature() rather than assigned
        # in place so that both spconv 1.x and 2.x tensors are supported.
        # The norm layers are reached through the norm1/norm2/norm3 accessors
        # (as SparseBasicBlock does) instead of the hard-coded bn1/bn2/bn3
        # names, which only exist for batch-norm style configs.
        out = self.conv1(x)
        out = replace_feature(out, self.norm1(out.features))
        out = replace_feature(out, self.relu(out.features))

        out = self.conv2(out)
        out = replace_feature(out, self.norm2(out.features))
        out = replace_feature(out, self.relu(out.features))

        out = self.conv3(out)
        out = replace_feature(out, self.norm3(out.features))

        if self.downsample is not None:
            identity = self.downsample(x)
            # A sparse downsample module returns a sparse tensor; only its
            # feature matrix can be added to ``out.features``.
            if isinstance(identity, spconv.SparseConvTensor):
                identity = identity.features

        out = replace_feature(out, out.features + identity)
        out = replace_feature(out, self.relu(out.features))

        return out
def make_sparse_convmodule(in_channels,
                           out_channels,
                           kernel_size,
                           indice_key,
                           stride=1,
                           padding=0,
                           conv_type='SubMConv3d',
                           norm_cfg=None,
                           order=('conv', 'norm', 'act')):
    """Make sparse convolution module.

    Args:
        in_channels (int): the number of input channels
        out_channels (int): the number of out channels
        kernel_size (int|tuple(int)): kernel size of convolution
        indice_key (str): the indice key used for sparse tensor
        stride (int|tuple(int)): the stride of convolution. Ignored for the
            ``SparseInverseConv*`` types, which are built without it.
        padding (int or list[int]): the padding number of input. Ignored for
            the ``SparseInverseConv*`` types.
        conv_type (str): sparse conv type in spconv
        norm_cfg (dict[str]): config of normalization layer
        order (tuple[str]): The order of conv/norm/activation layers. It is a
            sequence of "conv", "norm" and "act". Common examples are
            ("conv", "norm", "act") and ("act", "conv", "norm").

    Returns:
        spconv.SparseSequential: sparse convolution module.
    """
    assert isinstance(order, tuple) and len(order) <= 3
    assert set(order) <= {'conv', 'norm', 'act'}

    conv_cfg = dict(type=conv_type, indice_key=indice_key)
    # Inverse convolutions reuse the geometry of their paired convolution, so
    # they are constructed without stride/padding.
    inverse_conv_types = ('SparseInverseConv3d', 'SparseInverseConv2d',
                          'SparseInverseConv1d')

    layers = []
    conv_built = False
    for layer in order:
        if layer == 'conv':
            if conv_type not in inverse_conv_types:
                layers.append(
                    build_conv_layer(
                        conv_cfg,
                        in_channels,
                        out_channels,
                        kernel_size,
                        stride=stride,
                        padding=padding,
                        bias=False))
            else:
                layers.append(
                    build_conv_layer(
                        conv_cfg,
                        in_channels,
                        out_channels,
                        kernel_size,
                        bias=False))
            conv_built = True
        elif layer == 'norm':
            # A norm placed before the conv normalizes the module input, which
            # still has in_channels channels; only after the conv does the
            # feature width become out_channels.
            norm_channels = out_channels if conv_built else in_channels
            layers.append(build_norm_layer(norm_cfg, norm_channels)[1])
        elif layer == 'act':
            layers.append(nn.ReLU(inplace=True))

    return spconv.SparseSequential(*layers)
# from .conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d, # SparseConvTranspose3d, SparseInverseConv2d, # SparseInverseConv3d, SubMConv2d, SubMConv3d) # from .modules import SparseModule, SparseSequential # from .pool import SparseMaxPool2d, SparseMaxPool3d # from .structure import SparseConvTensor, scatter_nd from spconv.pytorch.conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d, SparseConvTranspose3d, SparseInverseConv2d, SparseInverseConv3d, SubMConv2d, SubMConv3d) from spconv.pytorch.modules import SparseModule, SparseSequential from spconv.pytorch.pool import SparseMaxPool2d, SparseMaxPool3d from spconv.pytorch.core import SparseConvTensor, scatter_nd from .overwrite_spconv.write_spconv2 import register_spconv2 register_spconv2() __all__ = [ 'SparseConv2d', 'SparseConv3d', 'SubMConv2d', 'SubMConv3d', 'SparseConvTranspose2d', 'SparseConvTranspose3d', 'SparseInverseConv2d', 'SparseInverseConv3d', 'SparseModule', 'SparseSequential', 'SparseMaxPool2d', 'SparseMaxPool3d', 'SparseConvTensor', 'scatter_nd', ] ================================================ FILE: mmdet3d/ops/spconv/conv.py ================================================ # Copyright 2019 Yan Yan # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import math import numpy as np import torch from mmcv.cnn import CONV_LAYERS from torch.nn import init from torch.nn.parameter import Parameter from . import functional as Fsp from . 
def _calculate_fan_in_and_fan_out_hwio(tensor):
    """Compute fan-in and fan-out for a weight stored channels-last.

    The last two dimensions are read as ``(in_channels, out_channels)`` and
    every leading dimension counts towards the receptive field.

    Args:
        tensor (torch.Tensor): weight with at least 2 dimensions.

    Returns:
        tuple[int, int]: ``(fan_in, fan_out)``.

    Raises:
        ValueError: if ``tensor`` has fewer than 2 dimensions.
    """
    dimensions = tensor.ndimension()
    if dimensions < 2:
        raise ValueError('fan in and fan out can not be computed for tensor '
                         'with fewer than 2 dimensions')

    num_input_fmaps = tensor.size(-2)
    num_output_fmaps = tensor.size(-1)

    # A plain 2-D (linear) weight has a receptive field of one element;
    # otherwise it is the number of kernel positions, i.e. the element count
    # of one (in, out) slice.
    receptive_field_size = 1
    if dimensions > 2:
        receptive_field_size = tensor[..., 0, 0].numel()

    fan_in = num_input_fmaps * receptive_field_size
    fan_out = num_output_fmaps * receptive_field_size
    return fan_in, fan_out
self.ndim = ndim self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = kernel_size self.conv1x1 = np.prod(kernel_size) == 1 self.stride = stride self.padding = padding self.dilation = dilation self.transposed = transposed self.inverse = inverse self.output_padding = output_padding self.groups = groups self.subm = subm self.indice_key = indice_key self.fused_bn = fused_bn self.weight = Parameter( torch.Tensor(*kernel_size, in_channels, out_channels)) if bias: self.bias = Parameter(torch.Tensor(out_channels)) else: self.register_parameter('bias', None) self.reset_parameters() def reset_parameters(self): init.kaiming_uniform_(self.weight, a=math.sqrt(5)) if self.bias is not None: fan_in, _ = _calculate_fan_in_and_fan_out_hwio(self.weight) bound = 1 / math.sqrt(fan_in) init.uniform_(self.bias, -bound, bound) def forward(self, input): assert isinstance(input, SparseConvTensor) features = input.features device = features.device indices = input.indices spatial_shape = input.spatial_shape batch_size = input.batch_size if not self.subm: if self.transposed: out_spatial_shape = ops.get_deconv_output_size( spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation, self.output_padding) else: out_spatial_shape = ops.get_conv_output_size( spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation) else: out_spatial_shape = spatial_shape # input.update_grid(out_spatial_shape) # t = time.time() if self.conv1x1: features = torch.mm( input.features, self.weight.view(self.in_channels, self.out_channels)) if self.bias is not None: features += self.bias out_tensor = SparseConvTensor(features, input.indices, input.spatial_shape, input.batch_size) out_tensor.indice_dict = input.indice_dict out_tensor.grid = input.grid return out_tensor datas = input.find_indice_pair(self.indice_key) if self.inverse: assert datas is not None and self.indice_key is not None _, outids, indice_pairs, indice_pair_num, out_spatial_shape = datas assert 
@CONV_LAYERS.register_module()
class SparseConv2d(SparseConvolution):
    """Regular (non-submanifold) sparse convolution over 2 spatial dims.

    Thin wrapper that fixes ``ndim`` to 2 and forwards every other argument
    to :class:`SparseConvolution`.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 indice_key=None):
        super().__init__(
            ndim=2,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            indice_key=indice_key)
@CONV_LAYERS.register_module()
class SparseInverseConv3d(SparseConvolution):
    """Inverse sparse convolution over 3 spatial dims.

    Fixes ``ndim`` to 3 and ``inverse`` to True. ``indice_key`` is required:
    the inverse path of :class:`SparseConvolution` looks up the index pairs
    cached under that key.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 indice_key,
                 bias=True):
        super().__init__(
            ndim=3,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            bias=bias,
            inverse=True,
            indice_key=indice_key)
@CONV_LAYERS.register_module()
class SubMConv3d(SparseConvolution):
    """Submanifold sparse convolution over 3 spatial dims.

    Fixes ``ndim`` to 3 and enables ``subm``; with ``subm`` set,
    :class:`SparseConvolution` keeps the input spatial shape for the output.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 indice_key=None):
        super().__init__(
            ndim=3,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            subm=True,
            indice_key=indice_key)
class SparseConvFunction(Function):
    """Autograd wrapper around ``ops.indice_conv`` for regular sparse conv.

    The trailing ``False`` passed to both ops is the only thing that
    distinguishes this function from its inverse / submanifold siblings.
    """

    @staticmethod
    def forward(ctx, features, filters, indice_pairs, indice_pair_num,
                num_activate_out):
        # Stash everything backward() needs before running the op.
        ctx.save_for_backward(indice_pairs, indice_pair_num, features,
                              filters)
        output = ops.indice_conv(features, filters, indice_pairs,
                                 indice_pair_num, num_activate_out, False)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        pairs, pair_num, saved_features, saved_filters = ctx.saved_tensors
        grad_features, grad_filters = ops.indice_conv_backward(
            saved_features, saved_filters, grad_output, pairs, pair_num,
            False)
        # Only features and filters are differentiable; the three index /
        # count inputs of forward() receive no gradient.
        return (grad_features, grad_filters) + (None, ) * 3
namespace detail {

// Number of combinations contributed by a single parameter list.
template <class T>
int getTotalSize(const std::vector<T> &arg) {
  return static_cast<int>(arg.size());
}

// Product of the sizes of all parameter lists.
template <class T, class... TArgs>
int getTotalSize(const std::vector<T> &arg, const std::vector<TArgs> &... args) {
  return static_cast<int>(arg.size()) * getTotalSize(args...);
}

// Size of one parameter list (used to expand a pack into a size table).
template <typename T>
int getSize(const std::vector<T> &arg) {
  return static_cast<int>(arg.size());
}

// Writes element counter[Idx] of the last parameter list into slot Idx of
// the tuple `src`.
template <int Idx, class TT, class T>
void assigner(TT &src, const std::vector<int> &counter,
              const std::vector<T> &arg) {
  std::get<Idx>(src) = arg[counter[Idx]];
}

// Writes slot Idx, then recurses into the remaining lists for slot Idx + 1.
template <int Idx, class TT, class T, class... TArgs>
void assigner(TT &src, const std::vector<int> &counter,
              const std::vector<T> &arg, const std::vector<TArgs> &... args) {
  std::get<Idx>(src) = arg[counter[Idx]];
  assigner<Idx + 1>(src, counter, args...);
}

}  // namespace detail

// Builds the Cartesian product of the given parameter lists.
//
// Each result tuple holds one element from every list. Tuples are produced
// in odometer order: the last list varies fastest, the first slowest. If any
// list is empty the result is empty.
template <class... T>
std::vector<std::tuple<T...>> paramsGrid(std::vector<T>... args) {
  const int length = detail::getTotalSize(args...);
  const std::vector<int> sizes = {detail::getSize(args)...};
  const int size = static_cast<int>(sizes.size());

  std::vector<std::tuple<T...>> params(length);
  std::vector<int> counter(size);  // current index into each list
  for (int i = 0; i < length; ++i) {
    detail::assigner<0>(params[i], counter, args...);

    // Advance the odometer: bump the last digit and carry leftwards while a
    // digit has run past the end of its list. Digit 0 never wraps; the loop
    // ends before it would overflow.
    counter[size - 1] += 1;
    for (int c = size - 1; c > 0 && counter[c] == sizes[c]; --c) {
      counter[c] = 0;
      counter[c - 1] += 1;
    }
  }
  return params;
}
struct sfinae_base { using yes = char; using no = yes[2]; }; template struct has_const_iterator : private sfinae_base { private: template static yes &test(typename C::const_iterator *); template static no &test(...); public: static const bool value = sizeof(test(nullptr)) == sizeof(yes); using type = T; }; template struct has_begin_end : private sfinae_base { private: template static yes & f(typename std::enable_if< std::is_same(&C::begin)), typename C::const_iterator (C::*)() const>::value>::type *); template static no &f(...); template static yes &g(typename std::enable_if< std::is_same(&C::end)), typename C::const_iterator (C::*)() const>::value, void>::type *); template static no &g(...); public: static bool const beg_value = sizeof(f(nullptr)) == sizeof(yes); static bool const end_value = sizeof(g(nullptr)) == sizeof(yes); }; } // namespace detail // Holds the delimiter values for a specific character type template struct delimiters_values { using char_type = TChar; const char_type *prefix; const char_type *delimiter; const char_type *postfix; }; // Defines the delimiter values for a specific container and character type template struct delimiters { using type = delimiters_values; static const type values; }; // Functor to print containers. You can use this directly if you want // to specificy a non-default delimiters type. The printing logic can // be customized by specializing the nested template. 
template , typename TDelimiters = delimiters> struct print_container_helper { using delimiters_type = TDelimiters; using ostream_type = std::basic_ostream; template struct printer { static void print_body(const U &c, ostream_type &stream) { using std::begin; using std::end; auto it = begin(c); const auto the_end = end(c); if (it != the_end) { for (;;) { stream << *it; if (++it == the_end) break; if (delimiters_type::values.delimiter != NULL) stream << delimiters_type::values.delimiter; } } } }; print_container_helper(const T &container) : container_(container) {} inline void operator()(ostream_type &stream) const { if (delimiters_type::values.prefix != NULL) stream << delimiters_type::values.prefix; printer::print_body(container_, stream); if (delimiters_type::values.postfix != NULL) stream << delimiters_type::values.postfix; } private: const T &container_; }; // Specialization for pairs template template struct print_container_helper::printer> { using ostream_type = typename print_container_helper::ostream_type; static void print_body(const std::pair &c, ostream_type &stream) { stream << c.first; if (print_container_helper::delimiters_type::values .delimiter != NULL) stream << print_container_helper::delimiters_type::values .delimiter; stream << c.second; } }; // Specialization for tuples template template struct print_container_helper::printer> { using ostream_type = typename print_container_helper::ostream_type; using element_type = std::tuple; template struct Int {}; static void print_body(const element_type &c, ostream_type &stream) { tuple_print(c, stream, Int<0>()); } static void tuple_print(const element_type &, ostream_type &, Int) {} static void tuple_print( const element_type &c, ostream_type &stream, typename std::conditional, std::nullptr_t>::type) { stream << std::get<0>(c); tuple_print(c, stream, Int<1>()); } template static void tuple_print(const element_type &c, ostream_type &stream, Int) { if (print_container_helper::delimiters_type::values 
.delimiter != NULL) stream << print_container_helper::delimiters_type::values .delimiter; stream << std::get(c); tuple_print(c, stream, Int()); } }; // Prints a print_container_helper to the specified stream. template inline std::basic_ostream &operator<<( std::basic_ostream &stream, const print_container_helper &helper) { helper(stream); return stream; } // Basic is_container template; specialize to derive from std::true_type for all // desired container types template struct is_container : public std::integral_constant::value && detail::has_begin_end::beg_value && detail::has_begin_end::end_value> {}; template struct is_container : std::true_type {}; template struct is_container : std::false_type {}; template struct is_container> : std::true_type {}; template struct is_container> : std::true_type {}; template struct is_container> : std::true_type {}; // Default delimiters template struct delimiters { static const delimiters_values values; }; template const delimiters_values delimiters::values = {"[", ", ", "]"}; template struct delimiters { static const delimiters_values values; }; template const delimiters_values delimiters::values = {L"[", L", ", L"]"}; // Delimiters for (multi)set and unordered_(multi)set template struct delimiters<::std::set, char> { static const delimiters_values values; }; template const delimiters_values delimiters<::std::set, char>::values = {"{", ", ", "}"}; template struct delimiters<::std::set, wchar_t> { static const delimiters_values values; }; template const delimiters_values delimiters<::std::set, wchar_t>::values = { L"{", L", ", L"}"}; template struct delimiters<::std::multiset, char> { static const delimiters_values values; }; template const delimiters_values delimiters<::std::multiset, char>::values = { "{", ", ", "}"}; template struct delimiters<::std::multiset, wchar_t> { static const delimiters_values values; }; template const delimiters_values delimiters<::std::multiset, wchar_t>::values = { L"{", L", ", L"}"}; template 
struct delimiters<::std::unordered_set, char> { static const delimiters_values values; }; template const delimiters_values delimiters< ::std::unordered_set, char>::values = { "{", ", ", "}"}; template struct delimiters<::std::unordered_set, wchar_t> { static const delimiters_values values; }; template const delimiters_values delimiters< ::std::unordered_set, wchar_t>::values = { L"{", L", ", L"}"}; template struct delimiters<::std::unordered_multiset, char> { static const delimiters_values values; }; template const delimiters_values delimiters< ::std::unordered_multiset, char>::values = { "{", ", ", "}"}; template struct delimiters<::std::unordered_multiset, wchar_t> { static const delimiters_values values; }; template const delimiters_values delimiters<::std::unordered_multiset, wchar_t>::values = {L"{", L", ", L"}"}; // Delimiters for pair and tuple template struct delimiters, char> { static const delimiters_values values; }; template const delimiters_values delimiters, char>::values = { "(", ", ", ")"}; template struct delimiters<::std::pair, wchar_t> { static const delimiters_values values; }; template const delimiters_values delimiters<::std::pair, wchar_t>::values = {L"(", L", ", L")"}; template struct delimiters, char> { static const delimiters_values values; }; template const delimiters_values delimiters, char>::values = { "(", ", ", ")"}; template struct delimiters<::std::tuple, wchar_t> { static const delimiters_values values; }; template const delimiters_values delimiters<::std::tuple, wchar_t>::values = {L"(", L", ", L")"}; // Type-erasing helper class for easy use of custom delimiters. // Requires TCharTraits = std::char_traits and TChar = char or wchar_t, // and MyDelims needs to be defined for TChar. Usage: "cout << // pretty_print::custom_delims(x)". 
struct custom_delims_base { virtual ~custom_delims_base() {} virtual std::ostream &stream(::std::ostream &) = 0; virtual std::wostream &stream(::std::wostream &) = 0; }; template struct custom_delims_wrapper : custom_delims_base { custom_delims_wrapper(const T &t_) : t(t_) {} std::ostream &stream(std::ostream &s) { return s << print_container_helper, Delims>( t); } std::wostream &stream(std::wostream &s) { return s << print_container_helper, Delims>(t); } private: const T &t; }; template struct custom_delims { template custom_delims(const Container &c) : base(new custom_delims_wrapper(c)) {} std::unique_ptr base; }; template inline std::basic_ostream &operator<<( std::basic_ostream &s, const custom_delims &p) { return p.base->stream(s); } // A wrapper for a C-style array given as pointer-plus-size. // Usage: std::cout << pretty_print_array(arr, n) << std::endl; template struct array_wrapper_n { typedef const T *const_iterator; typedef T value_type; array_wrapper_n(const T *const a, size_t n) : _array(a), _n(n) {} inline const_iterator begin() const { return _array; } inline const_iterator end() const { return _array + _n; } private: const T *const _array; size_t _n; }; // A wrapper for hash-table based containers that offer local iterators to each // bucket. Usage: std::cout << bucket_print(m, 4) << std::endl; (Prints bucket // 5 of container m.) 
template struct bucket_print_wrapper { typedef typename T::const_local_iterator const_iterator; typedef typename T::size_type size_type; const_iterator begin() const { return m_map.cbegin(n); } const_iterator end() const { return m_map.cend(n); } bucket_print_wrapper(const T &m, size_type bucket) : m_map(m), n(bucket) {} private: const T &m_map; const size_type n; }; } // namespace pretty_print // Global accessor functions for the convenience wrappers template inline pretty_print::array_wrapper_n pretty_print_array(const T *const a, size_t n) { return pretty_print::array_wrapper_n(a, n); } template pretty_print::bucket_print_wrapper bucket_print(const T &m, typename T::size_type n) { return pretty_print::bucket_print_wrapper(m, n); } // Main magic entry point: An overload snuck into namespace std. // Can we do better? namespace std { // Prints a container to the stream using default delimiters template inline typename enable_if<::pretty_print::is_container::value, basic_ostream &>::type operator<<(basic_ostream &stream, const T &container) { return stream << ::pretty_print::print_container_helper( container); } } // namespace std #endif // H_PRETTY_PRINT ================================================ FILE: mmdet3d/ops/spconv/include/pybind11_utils.h ================================================ // Copyright 2019 Yan Yan // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
#pragma once #include #include #include // everything needed for embedding #include #include #include #include #include namespace py = pybind11; template std::vector array2Vector(TPyObject arr){ py::array arr_np = arr; size_t size = arr.attr("size").template cast(); py::array_t arr_cc = arr_np; std::vector data(arr_cc.data(), arr_cc.data() + size); return data; } template std::vector arrayT2Vector(py::array_t arr) { std::vector data(arr.data(), arr.data() + arr.size()); return data; } template tv::TensorView array2TensorView(TPyObject arr){ py::array arr_np = arr; py::array_t arr_cc = arr_np; tv::Shape shape; for (int i = 0; i < arr_cc.ndim(); ++i){ shape.push_back(arr_cc.shape(i)); } return tv::TensorView(arr_cc.mutable_data(), shape); } template tv::TensorView arrayT2TensorView(py::array_t arr){ tv::Shape shape; for (int i = 0; i < arr.ndim(); ++i){ shape.push_back(arr.shape(i)); } return tv::TensorView(arr.mutable_data(), shape); } ================================================ FILE: mmdet3d/ops/spconv/include/spconv/fused_spconv_ops.h ================================================ // Copyright 2019 Yan Yan // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef FUSED_SPARSE_CONV_OP_H_ #define FUSED_SPARSE_CONV_OP_H_ #include #include #include #include #include #include namespace spconv { // torch.jit's doc says only support int64, so we need to convert to int32. 
template torch::Tensor fusedIndiceConvBatchNorm( torch::Tensor features, torch::Tensor filters, torch::Tensor bias, torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse, int64_t _subM) { bool subM = _subM != 0; bool inverse = _inverse != 0; auto device = features.device().type(); auto ndim = filters.dim() - 2; auto kernelVolume = indicePairs.size(0); auto numInPlanes = features.size(1); auto numOutPlanes = filters.size(ndim + 1); auto indicePairNumCpu = indiceNum.to({torch::kCPU}); auto indicePairMaxSizeIter = std::max_element(indicePairNumCpu.data_ptr(), indicePairNumCpu.data_ptr() + kernelVolume); int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data_ptr(); int indicePairMaxSize = *indicePairMaxSizeIter; /*if (_subM){ std::vector indicePairNumVec(indicePairNumCpu.data_ptr(), indicePairNumCpu.data_ptr() + kernelVolume); indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset); auto indicePairVecMaxSizeIter = std::max_element( indicePairNumVec.begin(), indicePairNumVec.end()); indicePairMaxSize = *indicePairVecMaxSizeIter; }*/ auto options = torch::TensorOptions().dtype(features.dtype()).device(features.device()); // auto indicePairOptions = // torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device()); torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options).copy_(bias); torch::Tensor inputBuffer = torch::zeros({indicePairMaxSize, numInPlanes}, options); torch::Tensor outputBuffer = torch::zeros({indicePairMaxSize, numOutPlanes}, options); filters = filters.view({-1, numInPlanes, numOutPlanes}); if (subM) { // the center index of subm conv don't need gather and scatter // add. 
torch::mm_out(output, features, filters[indicePairMaxOffset]); } double totalGatherTime = 0; double totalGEMMTime = 0; double totalSAddTime = 0; for (int i = 0; i < kernelVolume; ++i) { auto nHot = indicePairNumCpu.data_ptr()[i]; if (nHot <= 0 || (subM && i == indicePairMaxOffset)) { continue; } // auto timer = spconv::CudaContextTimer<>(); auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(), {nHot, numOutPlanes}, options); auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options); if (device == torch::kCPU) { functor::SparseGatherFunctor gatherFtor; gatherFtor(tv::CPU(), tv::torch2tv(inputBuffer), tv::torch2tv(features), tv::torch2tv(indicePairs).subview(i, inverse), nHot); } else { functor::SparseGatherFunctor gatherFtor; gatherFtor(tv::TorchGPU(), tv::torch2tv(inputBuffer), tv::torch2tv(features), tv::torch2tv(indicePairs).subview(i, inverse), nHot); TV_CHECK_CUDA_ERR(); /* slower than SparseGatherFunctor, may due to int->long conversion auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64); auto indicePairBlob = torch::from_blob(indicePairLong.data_ptr(), {nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob, features, 0, indicePairBlob);*/ } // totalGatherTime += timer.report() / 1000.0; torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]); // totalGEMMTime += timer.report() / 1000.0; if (device == torch::kCPU) { functor::SparseScatterAddFunctor scatterFtor; scatterFtor(tv::CPU(), tv::torch2tv(output), tv::torch2tv(outputBuffer), tv::torch2tv(indicePairs).subview(i, !inverse), nHot, true); } else { functor::SparseScatterAddFunctor scatterFtor; scatterFtor(tv::TorchGPU(), tv::torch2tv(output), tv::torch2tv(outputBuffer), tv::torch2tv(indicePairs).subview(i, !inverse), nHot, true); TV_CHECK_CUDA_ERR(); } // totalSAddTime += timer.report() / 1000.0; } // std::cout << "gather time " << totalGatherTime << std::endl; // std::cout << "gemm time " << totalGEMMTime << std::endl; // 
std::cout << "scatteradd time " << totalSAddTime << std::endl; return output; } } // namespace spconv #endif ================================================ FILE: mmdet3d/ops/spconv/include/spconv/geometry.h ================================================ // Copyright 2019 Yan Yan // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef SPCONV_GEOMETRY_H_ #define SPCONV_GEOMETRY_H_ #include #include #include namespace spconv { template TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos, const Index *kernelSize, const Index *stride, const Index *padding, const Index *dilation, const Index *outSpatialShape, Index *out) { Index lowers[NDim]; Index uppers[NDim]; Index counter[NDim]; Index counterSize[NDim]; Index pointCounter = 0; Index val; Index numPoints = 1; Index m, offset; bool valid = false; #pragma unroll for (int i = 0; i < NDim; ++i) { lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 + stride[i] + padding[i]) / stride[i]; uppers[i] = (input_pos[i] + padding[i]) / stride[i]; } #pragma unroll for (unsigned i = 0; i < NDim; ++i) { counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1); numPoints *= counterSize[i]; } #pragma unroll for (int i = 0; i < NDim; ++i) { counter[i] = 0; } for (int i = 0; i < numPoints; ++i) { valid = true; m = 1; offset = 0; #pragma unroll for (int j = NDim - 1; j >= 0; --j) { val = uppers[j] - counter[j] * dilation[j]; out[pointCounter * (NDim + 1) + j] = val; if (val < 0 || (val > outSpatialShape[j] - 1)) { 
valid = false; // break; } offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j]; m *= kernelSize[j]; } out[pointCounter * (NDim + 1) + NDim] = offset; if (valid) ++pointCounter; counter[NDim - 1] += 1; #pragma unroll for (int c = NDim - 1; c >= 0; --c) { if (counter[c] == counterSize[c] && c > 0) { counter[c - 1] += 1; counter[c] = 0; } } } return pointCounter; } template TV_HOST_DEVICE Index getValidOutPosTranspose( const Index *input_pos, const Index *kernelSize, const Index *stride, const Index *padding, const Index *dilation, const Index *outSpatialShape, Index *out) { Index lowers[NDim]; Index uppers[NDim]; Index counter[NDim]; Index counterSize[NDim]; Index pointCounter = 0; Index val; Index numPoints = 1; Index m, offset; bool valid = false; #pragma unroll for (int i = 0; i < NDim; ++i) { lowers[i] = input_pos[i] * stride[i] - padding[i]; uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i]; } #pragma unroll for (unsigned i = 0; i < NDim; ++i) { counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1); numPoints *= counterSize[i]; } #pragma unroll for (int i = 0; i < NDim; ++i) { counter[i] = 0; } for (int i = 0; i < numPoints; ++i) { valid = true; m = 1; offset = 0; #pragma unroll for (int j = NDim - 1; j >= 0; --j) { val = uppers[j] - counter[j] * dilation[j]; out[pointCounter * (NDim + 1) + j] = val; if (val < 0 || (val > outSpatialShape[j] - 1)) { valid = false; // break; } offset += m * (val - lowers[j]) / dilation[j]; m *= kernelSize[j]; } out[pointCounter * (NDim + 1) + NDim] = offset; if (valid) ++pointCounter; counter[NDim - 1] += 1; #pragma unroll for (int c = NDim - 1; c >= 0; --c) { if (counter[c] == counterSize[c] && c > 0) { counter[c - 1] += 1; counter[c] = 0; } } } return pointCounter; } template Index getIndicePairsConv(tv::TensorView indicesIn, tv::TensorView indicesOut, tv::TensorView gridsOut, tv::TensorView indicePairs, tv::TensorView indiceNum, const Index *kernelSize, const Index *stride, const Index 
*padding, const Index *dilation, const Index *outSpatialShape) { // indicesOut: num_active * kernelVolume * (NDim + 1) Index numAct = 0; auto numActIn = indicesIn.dim(0); Index batchIdx = 0; Index spatialVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { spatialVolume *= outSpatialShape[i]; } Index kernelVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { kernelVolume *= kernelSize[i]; } Index numValidPoints = 0; std::vector validPoints_(kernelVolume * (NDim + 1)); Index *validPoints = validPoints_.data(); Index *pointPtr = nullptr; for (int j = 0; j < numActIn; ++j) { batchIdx = indicesIn(j, 0); numValidPoints = getValidOutPos( indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding, dilation, outSpatialShape, validPoints); for (Index i = 0; i < numValidPoints; ++i) { pointPtr = validPoints + i * (NDim + 1); auto offset = pointPtr[NDim]; auto index = tv::rowArrayIdx(pointPtr, outSpatialShape) + spatialVolume * batchIdx; if (gridsOut[index] == -1) { for (unsigned k = 1; k < NDim + 1; ++k) { indicesOut(numAct, k) = pointPtr[k - 1]; } indicesOut(numAct, 0) = batchIdx; gridsOut[index] = numAct++; } // indicePairs: [K, 2, L] indicePairs(offset, 0, indiceNum[offset]) = j; indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; } } return numAct; } template Index getIndicePairsDeConv(tv::TensorView indicesIn, tv::TensorView indicesOut, tv::TensorView gridsOut, tv::TensorView indicePairs, tv::TensorView indiceNum, const Index *kernelSize, const Index *stride, const Index *padding, const Index *dilation, const Index *outSpatialShape) { Index numAct = 0; auto numActIn = indicesIn.dim(0); Index batchIdx = 0; Index spatialVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { spatialVolume *= outSpatialShape[i]; } Index kernelVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { kernelVolume *= kernelSize[i]; } Index numValidPoints = 0; std::vector validPoints_(kernelVolume * (NDim + 1)); Index *validPoints = validPoints_.data(); 
Index *pointPtr = nullptr; for (int j = 0; j < numActIn; ++j) { batchIdx = indicesIn(j, 0); numValidPoints = getValidOutPosTranspose( indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding, dilation, outSpatialShape, validPoints); for (Index i = 0; i < numValidPoints; ++i) { pointPtr = validPoints + i * (NDim + 1); auto offset = pointPtr[NDim]; auto index = tv::rowArrayIdx(pointPtr, outSpatialShape) + spatialVolume * batchIdx; if (gridsOut[index] == -1) { for (unsigned k = 1; k < NDim + 1; ++k) { indicesOut(numAct, k) = pointPtr[k - 1]; } indicesOut(numAct, 0) = batchIdx; gridsOut[index] = numAct++; } // indicePairs: [K, 2, L] indicePairs(offset, 0, indiceNum[offset]) = j; indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; } } return numAct; } template Index getIndicePairsSubM(tv::TensorView indicesIn, tv::TensorView gridsOut, tv::TensorView indicePairs, tv::TensorView indiceNum, const Index *const kernelSize, const Index *const stride, const Index *const padding, const Index *dilation, const Index *const outSpatialShape) { Index numAct = 0; auto numActIn = indicesIn.dim(0); Index batchIdx = 0; Index spatialVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { spatialVolume *= outSpatialShape[i]; } Index kernelVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { kernelVolume *= kernelSize[i]; } Index numValidPoints = 0; // Index validPoints[kernelVolume * (NDim + 1)]; std::vector validPoints_(kernelVolume * (NDim + 1)); Index *validPoints = validPoints_.data(); Index *pointPtr = nullptr; Index index = 0; for (int j = 0; j < numActIn; ++j) { index = tv::rowArrayIdx(indicesIn.data() + j * (NDim + 1) + 1, outSpatialShape) + spatialVolume * indicesIn(j, 0); gridsOut[index] = j; } for (int j = 0; j < numActIn; ++j) { numValidPoints = getValidOutPos( indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding, dilation, outSpatialShape, validPoints); for (Index i = 0; i < numValidPoints; ++i) { pointPtr = validPoints + i * 
(NDim + 1); auto offset = pointPtr[NDim]; index = tv::rowArrayIdx(pointPtr, outSpatialShape) + spatialVolume * indicesIn(j, 0); if (gridsOut[index] > -1) { indicePairs(offset, 0, indiceNum[offset]) = j; indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; } } } return numActIn; } } // namespace spconv #endif ================================================ FILE: mmdet3d/ops/spconv/include/spconv/indice.cu.h ================================================ // Copyright 2019 Yan Yan // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
#ifndef INDICE_CU_H_ #define INDICE_CU_H_ #include #include #include namespace spconv { template __global__ void prepareIndicePairsKernel( tv::TensorView indicesIn, tv::TensorView indicesOut, tv::TensorView gridsOut, tv::TensorView indicePairs, tv::TensorView indiceNum, tv::TensorView indicePairUnique, const tv::SimpleVector kernelSize, const tv::SimpleVector stride, const tv::SimpleVector padding, const tv::SimpleVector dilation, const tv::SimpleVector outSpatialShape) { auto numActIn = indicesIn.dim(0); Index spatialVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { spatialVolume *= outSpatialShape[i]; } Index kernelVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { kernelVolume *= kernelSize[i]; } Index numValidPoints = 0; Index validPoints[KernelMaxVolume * (NDim + 1)]; Index *pointPtr = nullptr; auto indicePairsDim2 = indicePairs.dim(2); Index index; for (int ix : tv::KernelLoopX(numActIn)) { numValidPoints = getValidOutPos( indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), validPoints); for (Index i = 0; i < numValidPoints; ++i) { pointPtr = validPoints + i * (NDim + 1); auto offset = pointPtr[NDim]; auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); indicePairs(offset, 0, oldNum) = ix; index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + spatialVolume * indicesIn(ix, 0); indicePairs(offset, 1, oldNum) = index; indicePairUnique[offset * indicePairsDim2 + oldNum] = index; } } } template __global__ void prepareDeConvIndicePairsKernel( tv::TensorView indicesIn, tv::TensorView indicesOut, tv::TensorView gridsOut, tv::TensorView indicePairs, tv::TensorView indiceNum, tv::TensorView indicePairUnique, const tv::SimpleVector kernelSize, const tv::SimpleVector stride, const tv::SimpleVector padding, const tv::SimpleVector dilation, const tv::SimpleVector outSpatialShape) { auto numActIn = indicesIn.dim(0); Index spatialVolume = 1; #pragma unroll for 
(int i = 0; i < NDim; ++i) { spatialVolume *= outSpatialShape[i]; } Index kernelVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { kernelVolume *= kernelSize[i]; } Index numValidPoints = 0; Index validPoints[KernelMaxVolume * (NDim + 1)]; Index *pointPtr = nullptr; auto indicePairsDim2 = indicePairs.dim(2); Index index; for (int ix : tv::KernelLoopX(numActIn)) { numValidPoints = getValidOutPosTranspose( indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), validPoints); for (Index i = 0; i < numValidPoints; ++i) { pointPtr = validPoints + i * (NDim + 1); auto offset = pointPtr[NDim]; auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); indicePairs(offset, 0, oldNum) = ix; index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + spatialVolume * indicesIn(ix, 0); indicePairs(offset, 1, oldNum) = index; indicePairUnique[offset * indicePairsDim2 + oldNum] = index; } } } template __global__ void assignGridAndIndiceOutKernel( tv::TensorView indicesOut, tv::TensorView gridsOut, int numAct, tv::TensorView indicePairs, tv::TensorView indicePairUnique, const tv::SimpleVector outSpatialShape, int batchSize) { Index index; auto indicesOutPtr = indicesOut.data(); for (int ix : tv::KernelLoopX(numAct)) { index = indicePairUnique[ix]; gridsOut[index] = ix; index = tv::rowArrayIdxInv( index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data()); indicesOut[ix * (NDim + 1)] = index % batchSize; } } template __global__ void assignIndicePairsKernel( tv::TensorView indicesOut, tv::TensorView gridsOut, int numActIn, tv::TensorView indicePairs, tv::TensorView indicePairUnique, const tv::SimpleVector outSpatialShape) { Index index; int kernelVolume = indicePairs.dim(0); for (int ix : tv::KernelLoopX(numActIn)) { for (int i = 0; i < kernelVolume; ++i) { index = indicePairs(i, 1, ix); if (index > -1) { indicePairs(i, 1, ix) = gridsOut[index]; } } } } template __global__ void 
prepareSubMGridKernel( tv::TensorView indicesIn, tv::TensorView gridsOut, const tv::SimpleVector outSpatialShape) { auto numActIn = indicesIn.dim(0); Index spatialVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { spatialVolume *= outSpatialShape[i]; } Index index = 0; for (int ix : tv::KernelLoopX(numActIn)) { index = tv::rowArrayIdx(indicesIn.data() + ix * (NDim + 1) + 1, outSpatialShape.data()) + spatialVolume * indicesIn(ix, 0); gridsOut[index] = ix; } } template __global__ void getSubMIndicePairsKernel( tv::TensorView indicesIn, tv::TensorView gridsOut, tv::TensorView indicePairs, tv::TensorView indiceNum, const tv::SimpleVector kernelSize, const tv::SimpleVector stride, const tv::SimpleVector padding, const tv::SimpleVector dilation, const tv::SimpleVector outSpatialShape) { auto numActIn = indicesIn.dim(0); Index spatialVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { spatialVolume *= outSpatialShape[i]; } Index numValidPoints = 0; Index validPoints[KernelMaxVolume * (NDim + 1)]; Index *pointPtr = nullptr; Index index = 0; for (int ix : tv::KernelLoopX(numActIn)) { numValidPoints = getValidOutPos( indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), validPoints); for (int i = 0; i < numValidPoints; ++i) { pointPtr = validPoints + i * (NDim + 1); auto offset = pointPtr[NDim]; index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + spatialVolume * indicesIn(ix, 0); if (gridsOut[index] > -1) { auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); indicePairs(offset, 1, oldNum) = gridsOut[index]; indicePairs(offset, 0, oldNum) = ix; } } } } template __global__ void resetGridKernel(const Index *indicePairUnique, tv::TensorView gridsOut, int numAct) { for (int ix : tv::KernelLoopX(numAct)) { gridsOut[indicePairUnique[ix]] = -1; } } template __global__ void resetGridSubMKernel( const Index *indices, tv::TensorView gridsOut, const tv::SimpleVector 
outSpatialShape, int numAct) { int outSpatialShapeReg[NDim]; for (int i = 0; i < NDim; ++i) { outSpatialShapeReg[i] = outSpatialShape[i]; } Index spatialVolume = 1; auto indsPtr = indices; #pragma unroll for (int i = 0; i < NDim; ++i) { spatialVolume *= outSpatialShape[i]; } Index index; for (int ix : tv::KernelLoopX(numAct)) { indsPtr = indices + ix * (NDim + 1); index = tv::rowArrayIdx(indsPtr + 1, outSpatialShapeReg); gridsOut[index + spatialVolume * indsPtr[0]] = -1; } } } // namespace spconv #endif ================================================ FILE: mmdet3d/ops/spconv/include/spconv/indice.h ================================================ // Copyright 2019 Yan Yan // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_
#define SPARSE_CONV_INDICE_FUNCTOR_H_
// NOTE(review): the #include below has no header name, and every `template`
// keyword in this header lacks a parameter list even though Device and Index
// are used. The angle-bracket content appears to be missing from this text --
// confirm against the upstream header.
#include
namespace spconv {
namespace functor {
// Functor declarations for building convolution index pairs. Only the call
// operators are declared in this header; no definitions appear here. Each one
// takes a `const Device&` as its first argument and returns an Index whose
// meaning is not visible from the declarations (presumably a count --
// TODO confirm).

// "P1" variant: takes the full kernel geometry (kernelSize, stride, padding,
// dilation, outSpatialShape) plus an indicePairUnique view. It has no
// resetGrid flag.
template struct CreateConvIndicePairFunctorP1 {
  Index operator()(const Device& d, tv::TensorView indicesIn,
                   tv::TensorView indicesOut, tv::TensorView gridsOut,
                   tv::TensorView indicePairs, tv::TensorView indiceNum,
                   tv::TensorView indicePairUnique,
                   const tv::SimpleVector kernelSize,
                   const tv::SimpleVector stride,
                   const tv::SimpleVector padding,
                   const tv::SimpleVector dilation,
                   const tv::SimpleVector outSpatialShape, bool transpose);
};

// "P2" variant: takes the same views as P1 but only outSpatialShape from the
// geometry arguments, and adds a resetGrid flag that defaults to false.
template struct CreateConvIndicePairFunctorP2 {
  Index operator()(const Device& d, tv::TensorView indicesIn,
                   tv::TensorView indicesOut, tv::TensorView gridsOut,
                   tv::TensorView indicePairs, tv::TensorView indiceNum,
                   tv::TensorView indicePairUnique,
                   const tv::SimpleVector outSpatialShape, bool transpose,
                   bool resetGrid = false);
};

// Single-call variant: full kernel geometry, no indicePairUnique view, and a
// resetGrid flag that defaults to false.
template struct CreateConvIndicePairFunctor {
  Index operator()(const Device& d, tv::TensorView indicesIn,
                   tv::TensorView indicesOut, tv::TensorView gridsOut,
                   tv::TensorView indicePairs, tv::TensorView indiceNum,
                   const tv::SimpleVector kernelSize,
                   const tv::SimpleVector stride,
                   const tv::SimpleVector padding,
                   const tv::SimpleVector dilation,
                   const tv::SimpleVector outSpatialShape, bool transpose,
                   bool resetGrid = false);
};

// "SubM" variant: same arguments as CreateConvIndicePairFunctor except that
// there is no indicesOut view.
template struct CreateSubMIndicePairFunctor {
  Index operator()(const Device& d, tv::TensorView indicesIn,
                   tv::TensorView gridsOut, tv::TensorView indicePairs,
                   tv::TensorView indiceNum,
                   const tv::SimpleVector kernelSize,
                   const tv::SimpleVector stride,
                   const tv::SimpleVector padding,
                   const tv::SimpleVector dilation,
                   const tv::SimpleVector outSpatialShape, bool transpose,
                   bool resetGrid = false);
};
} // namespace functor } // namespace spconv #endif ================================================ FILE: mmdet3d/ops/spconv/include/spconv/maxpool.h ================================================ // Copyright 2019 Yan Yan // // Licensed under the Apache License, Version 2.0 
(the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef SPARSE_MAXPOOL_FUNCTOR_H_ #define SPARSE_MAXPOOL_FUNCTOR_H_ #include namespace spconv { namespace functor { /* Forward sparse max-pool functor. Declaration only: the call operator takes a device handle, output and input feature views, an indices view and an int size, and returns void. NOTE(review): the template parameter list and the #include target are not visible in this text -- confirm against the upstream header. */ template struct SparseMaxPoolForwardFunctor { void operator()(const Device& d, tv::TensorView outFeatures, tv::TensorView inFeatures, tv::TensorView indices, int size); }; /* Backward counterpart. Declaration only: same arguments as the forward functor plus the dout and din views. */ template struct SparseMaxPoolBackwardFunctor { void operator()(const Device& d, tv::TensorView outFeatures, tv::TensorView inFeatures, tv::TensorView dout, tv::TensorView din, tv::TensorView indices, int size); }; } // namespace functor } // namespace spconv #endif ================================================ FILE: mmdet3d/ops/spconv/include/spconv/mp_helper.h ================================================ #ifndef MP_HELPER_H_ #define MP_HELPER_H_ #include #include namespace spconv { /* Empty tag type; it is the first argument of mp_for_each_impl below. */ template struct mp_list {}; template using mp_list_c = mp_list...>; namespace detail { /* Calls f(T()) for each expanded T through an initializer_list pack expansion, then returns the forwarded f. */ template constexpr F mp_for_each_impl(mp_list, F &&f) { return std::initializer_list{(f(T()), 0)...}, std::forward(f); } /* Overload for the empty mp_list<>: makes no calls and returns the forwarded f. */ template constexpr F mp_for_each_impl(mp_list<>, F &&f) { return std::forward(f); } } // namespace detail namespace detail { template class B> struct mp_rename_impl { // An error "no type named 'type'" here means that the first argument to // mp_rename is not a list }; template